Merge pull request #1199 from hashicorp/f-lighter-blocked-evals
Reuse blocked evaluations and handle unblock events that occurred during scheduling
This commit is contained in:
commit
52a78c5930
|
@ -35,15 +35,21 @@ type BlockedEvals struct {
|
||||||
escaped map[string]*structs.Evaluation
|
escaped map[string]*structs.Evaluation
|
||||||
|
|
||||||
// unblockCh is used to buffer unblocking of evaluations.
|
// unblockCh is used to buffer unblocking of evaluations.
|
||||||
capacityChangeCh chan string
|
capacityChangeCh chan *capacityUpdate
|
||||||
|
|
||||||
// jobs is the map of blocked job and is used to ensure that only one
|
// jobs is the map of blocked job and is used to ensure that only one
|
||||||
// blocked eval exists for each job.
|
// blocked eval exists for each job.
|
||||||
jobs map[string]struct{}
|
jobs map[string]struct{}
|
||||||
|
|
||||||
|
// unblockIndexes maps computed node classes to the index in which they were
|
||||||
|
// unblocked. This is used to check if an evaluation could have been
|
||||||
|
// unblocked between the time they were in the scheduler and the time they
|
||||||
|
// are being blocked.
|
||||||
|
unblockIndexes map[string]uint64
|
||||||
|
|
||||||
// duplicates is the set of evaluations for jobs that had pre-existing
|
// duplicates is the set of evaluations for jobs that had pre-existing
|
||||||
// blocked evaluations. These should be marked as cancelled since only one
|
// blocked evaluations. These should be marked as cancelled since only one
|
||||||
// blocked eval is neeeded bper job.
|
// blocked eval is neeeded per job.
|
||||||
duplicates []*structs.Evaluation
|
duplicates []*structs.Evaluation
|
||||||
|
|
||||||
// duplicateCh is used to signal that a duplicate eval was added to the
|
// duplicateCh is used to signal that a duplicate eval was added to the
|
||||||
|
@ -55,6 +61,12 @@ type BlockedEvals struct {
|
||||||
stopCh chan struct{}
|
stopCh chan struct{}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// capacityUpdate stores unblock data.
|
||||||
|
type capacityUpdate struct {
|
||||||
|
computedClass string
|
||||||
|
index uint64
|
||||||
|
}
|
||||||
|
|
||||||
// BlockedStats returns all the stats about the blocked eval tracker.
|
// BlockedStats returns all the stats about the blocked eval tracker.
|
||||||
type BlockedStats struct {
|
type BlockedStats struct {
|
||||||
// TotalEscaped is the total number of blocked evaluations that have escaped
|
// TotalEscaped is the total number of blocked evaluations that have escaped
|
||||||
|
@ -73,7 +85,8 @@ func NewBlockedEvals(evalBroker *EvalBroker) *BlockedEvals {
|
||||||
captured: make(map[string]*structs.Evaluation),
|
captured: make(map[string]*structs.Evaluation),
|
||||||
escaped: make(map[string]*structs.Evaluation),
|
escaped: make(map[string]*structs.Evaluation),
|
||||||
jobs: make(map[string]struct{}),
|
jobs: make(map[string]struct{}),
|
||||||
capacityChangeCh: make(chan string, unblockBuffer),
|
unblockIndexes: make(map[string]uint64),
|
||||||
|
capacityChangeCh: make(chan *capacityUpdate, unblockBuffer),
|
||||||
duplicateCh: make(chan struct{}, 1),
|
duplicateCh: make(chan struct{}, 1),
|
||||||
stopCh: make(chan struct{}),
|
stopCh: make(chan struct{}),
|
||||||
stats: new(BlockedStats),
|
stats: new(BlockedStats),
|
||||||
|
@ -133,6 +146,16 @@ func (b *BlockedEvals) Block(eval *structs.Evaluation) {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Check if the eval missed an unblock while it was in the scheduler at an
|
||||||
|
// older index. The scheduler could have been invoked with a snapshot of
|
||||||
|
// state that was prior to additional capacity being added or allocations
|
||||||
|
// becoming terminal.
|
||||||
|
if b.missedUnblock(eval) {
|
||||||
|
// Just re-enqueue the eval immediately
|
||||||
|
b.evalBroker.Enqueue(eval)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
// Mark the job as tracked.
|
// Mark the job as tracked.
|
||||||
b.stats.TotalBlocked++
|
b.stats.TotalBlocked++
|
||||||
b.jobs[eval.JobID] = struct{}{}
|
b.jobs[eval.JobID] = struct{}{}
|
||||||
|
@ -152,16 +175,65 @@ func (b *BlockedEvals) Block(eval *structs.Evaluation) {
|
||||||
b.captured[eval.ID] = eval
|
b.captured[eval.ID] = eval
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// missedUnblock returns whether an evaluation missed an unblock while it was in
|
||||||
|
// the scheduler. Since the scheduler can operate at an index in the past, the
|
||||||
|
// evaluation may have been processed missing data that would allow it to
|
||||||
|
// complete. This method returns if that is the case and should be called with
|
||||||
|
// the lock held.
|
||||||
|
func (b *BlockedEvals) missedUnblock(eval *structs.Evaluation) bool {
|
||||||
|
var max uint64 = 0
|
||||||
|
for class, index := range b.unblockIndexes {
|
||||||
|
// Calculate the max unblock index
|
||||||
|
if max < index {
|
||||||
|
max = index
|
||||||
|
}
|
||||||
|
|
||||||
|
elig, ok := eval.ClassEligibility[class]
|
||||||
|
if !ok {
|
||||||
|
// The evaluation was processed and did not encounter this class.
|
||||||
|
// Thus for correctness we need to unblock it.
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
|
||||||
|
// The evaluation could use the computed node class and the eval was
|
||||||
|
// processed before the last unblock.
|
||||||
|
if elig && eval.SnapshotIndex < index {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// If the evaluation has escaped, and the map contains an index older than
|
||||||
|
// the evaluations, it should be unblocked.
|
||||||
|
if eval.EscapedComputedClass && eval.SnapshotIndex < max {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
|
||||||
|
// The evaluation is ahead of all recent unblocks.
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
// Unblock causes any evaluation that could potentially make progress on a
|
// Unblock causes any evaluation that could potentially make progress on a
|
||||||
// capacity change on the passed computed node class to be enqueued into the
|
// capacity change on the passed computed node class to be enqueued into the
|
||||||
// eval broker.
|
// eval broker.
|
||||||
func (b *BlockedEvals) Unblock(computedClass string) {
|
func (b *BlockedEvals) Unblock(computedClass string, index uint64) {
|
||||||
|
b.l.Lock()
|
||||||
|
|
||||||
// Do nothing if not enabled
|
// Do nothing if not enabled
|
||||||
if !b.enabled {
|
if !b.enabled {
|
||||||
|
b.l.Unlock()
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
b.capacityChangeCh <- computedClass
|
// Store the index in which the unblock happened. We use this on subsequent
|
||||||
|
// block calls in case the evaluation was in the scheduler when a trigger
|
||||||
|
// occured.
|
||||||
|
b.unblockIndexes[computedClass] = index
|
||||||
|
b.l.Unlock()
|
||||||
|
|
||||||
|
b.capacityChangeCh <- &capacityUpdate{
|
||||||
|
computedClass: computedClass,
|
||||||
|
index: index,
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// watchCapacity is a long lived function that watches for capacity changes in
|
// watchCapacity is a long lived function that watches for capacity changes in
|
||||||
|
@ -171,15 +243,15 @@ func (b *BlockedEvals) watchCapacity() {
|
||||||
select {
|
select {
|
||||||
case <-b.stopCh:
|
case <-b.stopCh:
|
||||||
return
|
return
|
||||||
case computedClass := <-b.capacityChangeCh:
|
case update := <-b.capacityChangeCh:
|
||||||
b.unblock(computedClass)
|
b.unblock(update.computedClass, update.index)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// unblock unblocks all blocked evals that could run on the passed computed node
|
// unblock unblocks all blocked evals that could run on the passed computed node
|
||||||
// class.
|
// class.
|
||||||
func (b *BlockedEvals) unblock(computedClass string) {
|
func (b *BlockedEvals) unblock(computedClass string, index uint64) {
|
||||||
b.l.Lock()
|
b.l.Lock()
|
||||||
defer b.l.Unlock()
|
defer b.l.Unlock()
|
||||||
|
|
||||||
|
@ -229,6 +301,35 @@ func (b *BlockedEvals) unblock(computedClass string) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// UnblockFailed unblocks all blocked evaluation that were due to scheduler
|
||||||
|
// failure.
|
||||||
|
func (b *BlockedEvals) UnblockFailed() {
|
||||||
|
b.l.Lock()
|
||||||
|
defer b.l.Unlock()
|
||||||
|
|
||||||
|
// Do nothing if not enabled
|
||||||
|
if !b.enabled {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
var unblock []*structs.Evaluation
|
||||||
|
for id, eval := range b.captured {
|
||||||
|
if eval.TriggeredBy == structs.EvalTriggerMaxPlans {
|
||||||
|
unblock = append(unblock, eval)
|
||||||
|
delete(b.captured, id)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for id, eval := range b.escaped {
|
||||||
|
if eval.TriggeredBy == structs.EvalTriggerMaxPlans {
|
||||||
|
unblock = append(unblock, eval)
|
||||||
|
delete(b.escaped, id)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
b.evalBroker.EnqueueAll(unblock)
|
||||||
|
}
|
||||||
|
|
||||||
// GetDuplicates returns all the duplicate evaluations and blocks until the
|
// GetDuplicates returns all the duplicate evaluations and blocks until the
|
||||||
// passed timeout.
|
// passed timeout.
|
||||||
func (b *BlockedEvals) GetDuplicates(timeout time.Duration) []*structs.Evaluation {
|
func (b *BlockedEvals) GetDuplicates(timeout time.Duration) []*structs.Evaluation {
|
||||||
|
@ -273,7 +374,7 @@ func (b *BlockedEvals) Flush() {
|
||||||
b.escaped = make(map[string]*structs.Evaluation)
|
b.escaped = make(map[string]*structs.Evaluation)
|
||||||
b.jobs = make(map[string]struct{})
|
b.jobs = make(map[string]struct{})
|
||||||
b.duplicates = nil
|
b.duplicates = nil
|
||||||
b.capacityChangeCh = make(chan string, unblockBuffer)
|
b.capacityChangeCh = make(chan *capacityUpdate, unblockBuffer)
|
||||||
b.stopCh = make(chan struct{})
|
b.stopCh = make(chan struct{})
|
||||||
b.duplicateCh = make(chan struct{}, 1)
|
b.duplicateCh = make(chan struct{}, 1)
|
||||||
}
|
}
|
||||||
|
|
|
@ -53,6 +53,27 @@ func TestBlockedEvals_Block_SameJob(t *testing.T) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestBlockedEvals_Block_PriorUnblocks(t *testing.T) {
|
||||||
|
blocked, _ := testBlockedEvals(t)
|
||||||
|
|
||||||
|
// Do unblocks prior to blocking
|
||||||
|
blocked.Unblock("v1:123", 1000)
|
||||||
|
blocked.Unblock("v1:123", 1001)
|
||||||
|
|
||||||
|
// Create two blocked evals and add them to the blocked tracker.
|
||||||
|
e := mock.Eval()
|
||||||
|
e.Status = structs.EvalStatusBlocked
|
||||||
|
e.ClassEligibility = map[string]bool{"v1:123": false, "v1:456": false}
|
||||||
|
e.SnapshotIndex = 999
|
||||||
|
blocked.Block(e)
|
||||||
|
|
||||||
|
// Verify block did track both
|
||||||
|
bStats := blocked.Stats()
|
||||||
|
if bStats.TotalBlocked != 1 || bStats.TotalEscaped != 0 {
|
||||||
|
t.Fatalf("bad: %#v", bStats)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestBlockedEvals_GetDuplicates(t *testing.T) {
|
func TestBlockedEvals_GetDuplicates(t *testing.T) {
|
||||||
blocked, _ := testBlockedEvals(t)
|
blocked, _ := testBlockedEvals(t)
|
||||||
|
|
||||||
|
@ -105,7 +126,7 @@ func TestBlockedEvals_UnblockEscaped(t *testing.T) {
|
||||||
t.Fatalf("bad: %#v", bStats)
|
t.Fatalf("bad: %#v", bStats)
|
||||||
}
|
}
|
||||||
|
|
||||||
blocked.Unblock("v1:123")
|
blocked.Unblock("v1:123", 1000)
|
||||||
|
|
||||||
testutil.WaitForResult(func() (bool, error) {
|
testutil.WaitForResult(func() (bool, error) {
|
||||||
// Verify Unblock caused an enqueue
|
// Verify Unblock caused an enqueue
|
||||||
|
@ -141,7 +162,7 @@ func TestBlockedEvals_UnblockEligible(t *testing.T) {
|
||||||
t.Fatalf("bad: %#v", blockedStats)
|
t.Fatalf("bad: %#v", blockedStats)
|
||||||
}
|
}
|
||||||
|
|
||||||
blocked.Unblock("v1:123")
|
blocked.Unblock("v1:123", 1000)
|
||||||
|
|
||||||
testutil.WaitForResult(func() (bool, error) {
|
testutil.WaitForResult(func() (bool, error) {
|
||||||
// Verify Unblock caused an enqueue
|
// Verify Unblock caused an enqueue
|
||||||
|
@ -178,7 +199,7 @@ func TestBlockedEvals_UnblockIneligible(t *testing.T) {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Should do nothing
|
// Should do nothing
|
||||||
blocked.Unblock("v1:123")
|
blocked.Unblock("v1:123", 1000)
|
||||||
|
|
||||||
testutil.WaitForResult(func() (bool, error) {
|
testutil.WaitForResult(func() (bool, error) {
|
||||||
// Verify Unblock didn't cause an enqueue
|
// Verify Unblock didn't cause an enqueue
|
||||||
|
@ -214,7 +235,7 @@ func TestBlockedEvals_UnblockUnknown(t *testing.T) {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Should unblock because the eval hasn't seen this node class.
|
// Should unblock because the eval hasn't seen this node class.
|
||||||
blocked.Unblock("v1:789")
|
blocked.Unblock("v1:789", 1000)
|
||||||
|
|
||||||
testutil.WaitForResult(func() (bool, error) {
|
testutil.WaitForResult(func() (bool, error) {
|
||||||
// Verify Unblock causes an enqueue
|
// Verify Unblock causes an enqueue
|
||||||
|
@ -233,3 +254,139 @@ func TestBlockedEvals_UnblockUnknown(t *testing.T) {
|
||||||
t.Fatalf("err: %s", err)
|
t.Fatalf("err: %s", err)
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Test the block case in which the eval should be immediately unblocked since
|
||||||
|
// it is escaped and old
|
||||||
|
func TestBlockedEvals_Block_ImmediateUnblock_Escaped(t *testing.T) {
|
||||||
|
blocked, broker := testBlockedEvals(t)
|
||||||
|
|
||||||
|
// Do an unblock prior to blocking
|
||||||
|
blocked.Unblock("v1:123", 1000)
|
||||||
|
|
||||||
|
// Create a blocked eval that is eligible on a specific node class and add
|
||||||
|
// it to the blocked tracker.
|
||||||
|
e := mock.Eval()
|
||||||
|
e.Status = structs.EvalStatusBlocked
|
||||||
|
e.EscapedComputedClass = true
|
||||||
|
e.SnapshotIndex = 900
|
||||||
|
blocked.Block(e)
|
||||||
|
|
||||||
|
// Verify block caused the eval to be immediately unblocked
|
||||||
|
blockedStats := blocked.Stats()
|
||||||
|
if blockedStats.TotalBlocked != 0 && blockedStats.TotalEscaped != 0 {
|
||||||
|
t.Fatalf("bad: %#v", blockedStats)
|
||||||
|
}
|
||||||
|
|
||||||
|
testutil.WaitForResult(func() (bool, error) {
|
||||||
|
// Verify Unblock caused an enqueue
|
||||||
|
brokerStats := broker.Stats()
|
||||||
|
if brokerStats.TotalReady != 1 {
|
||||||
|
return false, fmt.Errorf("bad: %#v", brokerStats)
|
||||||
|
}
|
||||||
|
|
||||||
|
return true, nil
|
||||||
|
}, func(err error) {
|
||||||
|
t.Fatalf("err: %s", err)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test the block case in which the eval should be immediately unblocked since
|
||||||
|
// it there is an unblock on an unseen class
|
||||||
|
func TestBlockedEvals_Block_ImmediateUnblock_UnseenClass(t *testing.T) {
|
||||||
|
blocked, broker := testBlockedEvals(t)
|
||||||
|
|
||||||
|
// Do an unblock prior to blocking
|
||||||
|
blocked.Unblock("v1:123", 1000)
|
||||||
|
|
||||||
|
// Create a blocked eval that is eligible on a specific node class and add
|
||||||
|
// it to the blocked tracker.
|
||||||
|
e := mock.Eval()
|
||||||
|
e.Status = structs.EvalStatusBlocked
|
||||||
|
e.EscapedComputedClass = false
|
||||||
|
e.SnapshotIndex = 900
|
||||||
|
blocked.Block(e)
|
||||||
|
|
||||||
|
// Verify block caused the eval to be immediately unblocked
|
||||||
|
blockedStats := blocked.Stats()
|
||||||
|
if blockedStats.TotalBlocked != 0 && blockedStats.TotalEscaped != 0 {
|
||||||
|
t.Fatalf("bad: %#v", blockedStats)
|
||||||
|
}
|
||||||
|
|
||||||
|
testutil.WaitForResult(func() (bool, error) {
|
||||||
|
// Verify Unblock caused an enqueue
|
||||||
|
brokerStats := broker.Stats()
|
||||||
|
if brokerStats.TotalReady != 1 {
|
||||||
|
return false, fmt.Errorf("bad: %#v", brokerStats)
|
||||||
|
}
|
||||||
|
|
||||||
|
return true, nil
|
||||||
|
}, func(err error) {
|
||||||
|
t.Fatalf("err: %s", err)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test the block case in which the eval should be immediately unblocked since
|
||||||
|
// it a class it is eligible for has been unblocked
|
||||||
|
func TestBlockedEvals_Block_ImmediateUnblock_SeenClass(t *testing.T) {
|
||||||
|
blocked, broker := testBlockedEvals(t)
|
||||||
|
|
||||||
|
// Do an unblock prior to blocking
|
||||||
|
blocked.Unblock("v1:123", 1000)
|
||||||
|
|
||||||
|
// Create a blocked eval that is eligible on a specific node class and add
|
||||||
|
// it to the blocked tracker.
|
||||||
|
e := mock.Eval()
|
||||||
|
e.Status = structs.EvalStatusBlocked
|
||||||
|
e.ClassEligibility = map[string]bool{"v1:123": true, "v1:456": false}
|
||||||
|
e.SnapshotIndex = 900
|
||||||
|
blocked.Block(e)
|
||||||
|
|
||||||
|
// Verify block caused the eval to be immediately unblocked
|
||||||
|
blockedStats := blocked.Stats()
|
||||||
|
if blockedStats.TotalBlocked != 0 && blockedStats.TotalEscaped != 0 {
|
||||||
|
t.Fatalf("bad: %#v", blockedStats)
|
||||||
|
}
|
||||||
|
|
||||||
|
testutil.WaitForResult(func() (bool, error) {
|
||||||
|
// Verify Unblock caused an enqueue
|
||||||
|
brokerStats := broker.Stats()
|
||||||
|
if brokerStats.TotalReady != 1 {
|
||||||
|
return false, fmt.Errorf("bad: %#v", brokerStats)
|
||||||
|
}
|
||||||
|
|
||||||
|
return true, nil
|
||||||
|
}, func(err error) {
|
||||||
|
t.Fatalf("err: %s", err)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestBlockedEvals_UnblockFailed(t *testing.T) {
|
||||||
|
blocked, broker := testBlockedEvals(t)
|
||||||
|
|
||||||
|
// Create blocked evals that are due to failures
|
||||||
|
e := mock.Eval()
|
||||||
|
e.Status = structs.EvalStatusBlocked
|
||||||
|
e.TriggeredBy = structs.EvalTriggerMaxPlans
|
||||||
|
e.EscapedComputedClass = true
|
||||||
|
blocked.Block(e)
|
||||||
|
|
||||||
|
e2 := mock.Eval()
|
||||||
|
e2.Status = structs.EvalStatusBlocked
|
||||||
|
e2.TriggeredBy = structs.EvalTriggerMaxPlans
|
||||||
|
e2.ClassEligibility = map[string]bool{"v1:123": true, "v1:456": false}
|
||||||
|
blocked.Block(e2)
|
||||||
|
|
||||||
|
// Trigger an unblock fail
|
||||||
|
blocked.UnblockFailed()
|
||||||
|
|
||||||
|
testutil.WaitForResult(func() (bool, error) {
|
||||||
|
// Verify Unblock caused an enqueue
|
||||||
|
brokerStats := broker.Stats()
|
||||||
|
if brokerStats.TotalReady != 2 {
|
||||||
|
return false, fmt.Errorf("bad: %#v", brokerStats)
|
||||||
|
}
|
||||||
|
return true, nil
|
||||||
|
}, func(err error) {
|
||||||
|
t.Fatalf("err: %s", err)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
|
@ -202,6 +202,46 @@ func (e *Eval) Create(args *structs.EvalUpdateRequest,
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Reblock is used to reinsert an existing blocked evaluation into the blocked
|
||||||
|
// evaluation tracker.
|
||||||
|
func (e *Eval) Reblock(args *structs.EvalUpdateRequest, reply *structs.GenericResponse) error {
|
||||||
|
if done, err := e.srv.forward("Eval.Reblock", args, args, reply); done {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
defer metrics.MeasureSince([]string{"nomad", "eval", "reblock"}, time.Now())
|
||||||
|
|
||||||
|
// Ensure there is only a single update with token
|
||||||
|
if len(args.Evals) != 1 {
|
||||||
|
return fmt.Errorf("only a single eval can be reblocked")
|
||||||
|
}
|
||||||
|
eval := args.Evals[0]
|
||||||
|
|
||||||
|
// Verify the evaluation is outstanding, and that the tokens match.
|
||||||
|
if err := e.srv.evalBroker.OutstandingReset(eval.ID, args.EvalToken); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
// Look for the eval
|
||||||
|
snap, err := e.srv.fsm.State().Snapshot()
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
out, err := snap.EvalByID(eval.ID)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if out == nil {
|
||||||
|
return fmt.Errorf("evaluation does not exist")
|
||||||
|
}
|
||||||
|
if out.Status != structs.EvalStatusBlocked {
|
||||||
|
return fmt.Errorf("evaluation not blocked")
|
||||||
|
}
|
||||||
|
|
||||||
|
// Reblock the eval
|
||||||
|
e.srv.blockedEvals.Block(eval)
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
// Reap is used to cleanup dead evaluations and allocations
|
// Reap is used to cleanup dead evaluations and allocations
|
||||||
func (e *Eval) Reap(args *structs.EvalDeleteRequest,
|
func (e *Eval) Reap(args *structs.EvalDeleteRequest,
|
||||||
reply *structs.GenericResponse) error {
|
reply *structs.GenericResponse) error {
|
||||||
|
|
|
@ -575,3 +575,121 @@ func TestEvalEndpoint_Allocations_Blocking(t *testing.T) {
|
||||||
t.Fatalf("bad: %#v", resp.Allocations)
|
t.Fatalf("bad: %#v", resp.Allocations)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestEvalEndpoint_Reblock_NonExistent(t *testing.T) {
|
||||||
|
s1 := testServer(t, nil)
|
||||||
|
defer s1.Shutdown()
|
||||||
|
codec := rpcClient(t, s1)
|
||||||
|
|
||||||
|
testutil.WaitForResult(func() (bool, error) {
|
||||||
|
return s1.evalBroker.Enabled(), nil
|
||||||
|
}, func(err error) {
|
||||||
|
t.Fatalf("should enable eval broker")
|
||||||
|
})
|
||||||
|
|
||||||
|
// Create the register request
|
||||||
|
eval1 := mock.Eval()
|
||||||
|
s1.evalBroker.Enqueue(eval1)
|
||||||
|
out, token, err := s1.evalBroker.Dequeue(defaultSched, time.Second)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("err: %v", err)
|
||||||
|
}
|
||||||
|
if out == nil {
|
||||||
|
t.Fatalf("missing eval")
|
||||||
|
}
|
||||||
|
|
||||||
|
get := &structs.EvalUpdateRequest{
|
||||||
|
Evals: []*structs.Evaluation{eval1},
|
||||||
|
EvalToken: token,
|
||||||
|
WriteRequest: structs.WriteRequest{Region: "global"},
|
||||||
|
}
|
||||||
|
var resp structs.GenericResponse
|
||||||
|
if err := msgpackrpc.CallWithCodec(codec, "Eval.Reblock", get, &resp); err == nil {
|
||||||
|
t.Fatalf("expect error since eval does not exist")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestEvalEndpoint_Reblock_NonBlocked(t *testing.T) {
|
||||||
|
s1 := testServer(t, nil)
|
||||||
|
defer s1.Shutdown()
|
||||||
|
codec := rpcClient(t, s1)
|
||||||
|
|
||||||
|
testutil.WaitForResult(func() (bool, error) {
|
||||||
|
return s1.evalBroker.Enabled(), nil
|
||||||
|
}, func(err error) {
|
||||||
|
t.Fatalf("should enable eval broker")
|
||||||
|
})
|
||||||
|
|
||||||
|
// Create the eval
|
||||||
|
eval1 := mock.Eval()
|
||||||
|
s1.evalBroker.Enqueue(eval1)
|
||||||
|
|
||||||
|
// Insert it into the state store
|
||||||
|
if err := s1.fsm.State().UpsertEvals(1000, []*structs.Evaluation{eval1}); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
out, token, err := s1.evalBroker.Dequeue(defaultSched, time.Second)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("err: %v", err)
|
||||||
|
}
|
||||||
|
if out == nil {
|
||||||
|
t.Fatalf("missing eval")
|
||||||
|
}
|
||||||
|
|
||||||
|
get := &structs.EvalUpdateRequest{
|
||||||
|
Evals: []*structs.Evaluation{eval1},
|
||||||
|
EvalToken: token,
|
||||||
|
WriteRequest: structs.WriteRequest{Region: "global"},
|
||||||
|
}
|
||||||
|
var resp structs.GenericResponse
|
||||||
|
if err := msgpackrpc.CallWithCodec(codec, "Eval.Reblock", get, &resp); err == nil {
|
||||||
|
t.Fatalf("should error since eval was not in blocked state", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestEvalEndpoint_Reblock(t *testing.T) {
|
||||||
|
s1 := testServer(t, nil)
|
||||||
|
defer s1.Shutdown()
|
||||||
|
codec := rpcClient(t, s1)
|
||||||
|
|
||||||
|
testutil.WaitForResult(func() (bool, error) {
|
||||||
|
return s1.evalBroker.Enabled(), nil
|
||||||
|
}, func(err error) {
|
||||||
|
t.Fatalf("should enable eval broker")
|
||||||
|
})
|
||||||
|
|
||||||
|
// Create the eval
|
||||||
|
eval1 := mock.Eval()
|
||||||
|
eval1.Status = structs.EvalStatusBlocked
|
||||||
|
s1.evalBroker.Enqueue(eval1)
|
||||||
|
|
||||||
|
// Insert it into the state store
|
||||||
|
if err := s1.fsm.State().UpsertEvals(1000, []*structs.Evaluation{eval1}); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
out, token, err := s1.evalBroker.Dequeue(defaultSched, time.Second)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("err: %v", err)
|
||||||
|
}
|
||||||
|
if out == nil {
|
||||||
|
t.Fatalf("missing eval")
|
||||||
|
}
|
||||||
|
|
||||||
|
get := &structs.EvalUpdateRequest{
|
||||||
|
Evals: []*structs.Evaluation{eval1},
|
||||||
|
EvalToken: token,
|
||||||
|
WriteRequest: structs.WriteRequest{Region: "global"},
|
||||||
|
}
|
||||||
|
var resp structs.GenericResponse
|
||||||
|
if err := msgpackrpc.CallWithCodec(codec, "Eval.Reblock", get, &resp); err != nil {
|
||||||
|
t.Fatalf("err: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check that it is blocked
|
||||||
|
bStats := s1.blockedEvals.Stats()
|
||||||
|
if bStats.TotalBlocked+bStats.TotalEscaped == 0 {
|
||||||
|
t.Fatalf("ReblockEval didn't insert eval into the blocked eval tracker")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
|
@ -158,7 +158,7 @@ func (n *nomadFSM) applyUpsertNode(buf []byte, index uint64) interface{} {
|
||||||
// Unblock evals for the nodes computed node class if it is in a ready
|
// Unblock evals for the nodes computed node class if it is in a ready
|
||||||
// state.
|
// state.
|
||||||
if req.Node.Status == structs.NodeStatusReady {
|
if req.Node.Status == structs.NodeStatusReady {
|
||||||
n.blockedEvals.Unblock(req.Node.ComputedClass)
|
n.blockedEvals.Unblock(req.Node.ComputedClass, index)
|
||||||
}
|
}
|
||||||
|
|
||||||
return nil
|
return nil
|
||||||
|
@ -199,7 +199,7 @@ func (n *nomadFSM) applyStatusUpdate(buf []byte, index uint64) interface{} {
|
||||||
return err
|
return err
|
||||||
|
|
||||||
}
|
}
|
||||||
n.blockedEvals.Unblock(node.ComputedClass)
|
n.blockedEvals.Unblock(node.ComputedClass, index)
|
||||||
}
|
}
|
||||||
|
|
||||||
return nil
|
return nil
|
||||||
|
@ -420,7 +420,7 @@ func (n *nomadFSM) applyAllocClientUpdate(buf []byte, index uint64) interface{}
|
||||||
return err
|
return err
|
||||||
|
|
||||||
}
|
}
|
||||||
n.blockedEvals.Unblock(node.ComputedClass)
|
n.blockedEvals.Unblock(node.ComputedClass, index)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -11,6 +11,13 @@ import (
|
||||||
"github.com/hashicorp/serf/serf"
|
"github.com/hashicorp/serf/serf"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
const (
|
||||||
|
// failedEvalUnblockInterval is the interval at which failed evaluations are
|
||||||
|
// unblocked to re-enter the scheduler. A failed evaluation occurs under
|
||||||
|
// high contention when the schedulers plan does not make progress.
|
||||||
|
failedEvalUnblockInterval = 1 * time.Minute
|
||||||
|
)
|
||||||
|
|
||||||
// monitorLeadership is used to monitor if we acquire or lose our role
|
// monitorLeadership is used to monitor if we acquire or lose our role
|
||||||
// as the leader in the Raft cluster. There is some work the leader is
|
// as the leader in the Raft cluster. There is some work the leader is
|
||||||
// expected to do, so we must react to changes
|
// expected to do, so we must react to changes
|
||||||
|
@ -143,6 +150,9 @@ func (s *Server) establishLeadership(stopCh chan struct{}) error {
|
||||||
// Reap any duplicate blocked evaluations
|
// Reap any duplicate blocked evaluations
|
||||||
go s.reapDupBlockedEvaluations(stopCh)
|
go s.reapDupBlockedEvaluations(stopCh)
|
||||||
|
|
||||||
|
// Periodically unblock failed allocations
|
||||||
|
go s.periodicUnblockFailedEvals(stopCh)
|
||||||
|
|
||||||
// Setup the heartbeat timers. This is done both when starting up or when
|
// Setup the heartbeat timers. This is done both when starting up or when
|
||||||
// a leader fail over happens. Since the timers are maintained by the leader
|
// a leader fail over happens. Since the timers are maintained by the leader
|
||||||
// node, effectively this means all the timers are renewed at the time of failover.
|
// node, effectively this means all the timers are renewed at the time of failover.
|
||||||
|
@ -341,6 +351,21 @@ func (s *Server) reapDupBlockedEvaluations(stopCh chan struct{}) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// periodicUnblockFailedEvals periodically unblocks failed, blocked evaluations.
|
||||||
|
func (s *Server) periodicUnblockFailedEvals(stopCh chan struct{}) {
|
||||||
|
ticker := time.NewTimer(failedEvalUnblockInterval)
|
||||||
|
defer ticker.Stop()
|
||||||
|
for {
|
||||||
|
select {
|
||||||
|
case <-stopCh:
|
||||||
|
return
|
||||||
|
case <-ticker.C:
|
||||||
|
// Unblock the failed allocations
|
||||||
|
s.blockedEvals.UnblockFailed()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// revokeLeadership is invoked once we step down as leader.
|
// revokeLeadership is invoked once we step down as leader.
|
||||||
// This is used to cleanup any state that may be specific to a leader.
|
// This is used to cleanup any state that may be specific to a leader.
|
||||||
func (s *Server) revokeLeadership() error {
|
func (s *Server) revokeLeadership() error {
|
||||||
|
|
|
@ -972,6 +972,32 @@ func (s *StateStore) Allocs() (memdb.ResultIterator, error) {
|
||||||
return iter, nil
|
return iter, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// LastIndex returns the greatest index value for all indexes
|
||||||
|
func (s *StateStore) LatestIndex() (uint64, error) {
|
||||||
|
indexes, err := s.Indexes()
|
||||||
|
if err != nil {
|
||||||
|
return 0, err
|
||||||
|
}
|
||||||
|
|
||||||
|
var max uint64 = 0
|
||||||
|
for {
|
||||||
|
raw := indexes.Next()
|
||||||
|
if raw == nil {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
|
||||||
|
// Prepare the request struct
|
||||||
|
idx := raw.(*IndexEntry)
|
||||||
|
|
||||||
|
// Determine the max
|
||||||
|
if idx.Value > max {
|
||||||
|
max = idx.Value
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return max, nil
|
||||||
|
}
|
||||||
|
|
||||||
// Index finds the matching index value
|
// Index finds the matching index value
|
||||||
func (s *StateStore) Index(name string) (uint64, error) {
|
func (s *StateStore) Index(name string) (uint64, error) {
|
||||||
txn := s.db.Txn(false)
|
txn := s.db.Txn(false)
|
||||||
|
|
|
@ -1019,6 +1019,28 @@ func TestStateStore_Indexes(t *testing.T) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestStateStore_LatestIndex(t *testing.T) {
|
||||||
|
state := testStateStore(t)
|
||||||
|
|
||||||
|
if err := state.UpsertNode(1000, mock.Node()); err != nil {
|
||||||
|
t.Fatalf("err: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
exp := uint64(2000)
|
||||||
|
if err := state.UpsertJob(exp, mock.Job()); err != nil {
|
||||||
|
t.Fatalf("err: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
latest, err := state.LatestIndex()
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("err: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if latest != exp {
|
||||||
|
t.Fatalf("LatestIndex() returned %d; want %d", latest, exp)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestStateStore_RestoreIndex(t *testing.T) {
|
func TestStateStore_RestoreIndex(t *testing.T) {
|
||||||
state := testStateStore(t)
|
state := testStateStore(t)
|
||||||
|
|
||||||
|
|
|
@ -2570,6 +2570,7 @@ const (
|
||||||
EvalTriggerNodeUpdate = "node-update"
|
EvalTriggerNodeUpdate = "node-update"
|
||||||
EvalTriggerScheduled = "scheduled"
|
EvalTriggerScheduled = "scheduled"
|
||||||
EvalTriggerRollingUpdate = "rolling-update"
|
EvalTriggerRollingUpdate = "rolling-update"
|
||||||
|
EvalTriggerMaxPlans = "max-plan-attempts"
|
||||||
)
|
)
|
||||||
|
|
||||||
const (
|
const (
|
||||||
|
@ -2671,6 +2672,11 @@ type Evaluation struct {
|
||||||
// during the evaluation. This should not be set during normal operations.
|
// during the evaluation. This should not be set during normal operations.
|
||||||
AnnotatePlan bool
|
AnnotatePlan bool
|
||||||
|
|
||||||
|
// SnapshotIndex is the Raft index of the snapshot used to process the
|
||||||
|
// evaluation. As such it will only be set once it has gone through the
|
||||||
|
// scheduler.
|
||||||
|
SnapshotIndex uint64
|
||||||
|
|
||||||
// Raft Indexes
|
// Raft Indexes
|
||||||
CreateIndex uint64
|
CreateIndex uint64
|
||||||
ModifyIndex uint64
|
ModifyIndex uint64
|
||||||
|
|
|
@ -59,6 +59,11 @@ type Worker struct {
|
||||||
failures uint
|
failures uint
|
||||||
|
|
||||||
evalToken string
|
evalToken string
|
||||||
|
|
||||||
|
// snapshotIndex is the index of the snapshot in which the scheduler was
|
||||||
|
// first envoked. It is used to mark the SnapshotIndex of evaluations
|
||||||
|
// Created, Updated or Reblocked.
|
||||||
|
snapshotIndex uint64
|
||||||
}
|
}
|
||||||
|
|
||||||
// NewWorker starts a new worker associated with the given server
|
// NewWorker starts a new worker associated with the given server
|
||||||
|
@ -241,6 +246,12 @@ func (w *Worker) invokeScheduler(eval *structs.Evaluation, token string) error {
|
||||||
return fmt.Errorf("failed to snapshot state: %v", err)
|
return fmt.Errorf("failed to snapshot state: %v", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Store the snapshot's index
|
||||||
|
w.snapshotIndex, err = snap.LatestIndex()
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("failed to determine snapshot's index: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
// Create the scheduler, or use the special system scheduler
|
// Create the scheduler, or use the special system scheduler
|
||||||
var sched scheduler.Scheduler
|
var sched scheduler.Scheduler
|
||||||
if eval.Type == structs.JobTypeCore {
|
if eval.Type == structs.JobTypeCore {
|
||||||
|
@ -334,6 +345,9 @@ func (w *Worker) UpdateEval(eval *structs.Evaluation) error {
|
||||||
}
|
}
|
||||||
defer metrics.MeasureSince([]string{"nomad", "worker", "update_eval"}, time.Now())
|
defer metrics.MeasureSince([]string{"nomad", "worker", "update_eval"}, time.Now())
|
||||||
|
|
||||||
|
// Store the snapshot index in the eval
|
||||||
|
eval.SnapshotIndex = w.snapshotIndex
|
||||||
|
|
||||||
// Setup the request
|
// Setup the request
|
||||||
req := structs.EvalUpdateRequest{
|
req := structs.EvalUpdateRequest{
|
||||||
Evals: []*structs.Evaluation{eval},
|
Evals: []*structs.Evaluation{eval},
|
||||||
|
@ -369,6 +383,9 @@ func (w *Worker) CreateEval(eval *structs.Evaluation) error {
|
||||||
}
|
}
|
||||||
defer metrics.MeasureSince([]string{"nomad", "worker", "create_eval"}, time.Now())
|
defer metrics.MeasureSince([]string{"nomad", "worker", "create_eval"}, time.Now())
|
||||||
|
|
||||||
|
// Store the snapshot index in the eval
|
||||||
|
eval.SnapshotIndex = w.snapshotIndex
|
||||||
|
|
||||||
// Setup the request
|
// Setup the request
|
||||||
req := structs.EvalUpdateRequest{
|
req := structs.EvalUpdateRequest{
|
||||||
Evals: []*structs.Evaluation{eval},
|
Evals: []*structs.Evaluation{eval},
|
||||||
|
@ -395,6 +412,44 @@ SUBMIT:
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ReblockEval is used to reinsert a blocked evaluation into the blocked eval
|
||||||
|
// tracker. This allows the worker to act as the planner for the scheduler.
|
||||||
|
func (w *Worker) ReblockEval(eval *structs.Evaluation) error {
|
||||||
|
// Check for a shutdown before plan submission
|
||||||
|
if w.srv.IsShutdown() {
|
||||||
|
return fmt.Errorf("shutdown while planning")
|
||||||
|
}
|
||||||
|
defer metrics.MeasureSince([]string{"nomad", "worker", "reblock_eval"}, time.Now())
|
||||||
|
|
||||||
|
// Store the snapshot index in the eval
|
||||||
|
eval.SnapshotIndex = w.snapshotIndex
|
||||||
|
|
||||||
|
// Setup the request
|
||||||
|
req := structs.EvalUpdateRequest{
|
||||||
|
Evals: []*structs.Evaluation{eval},
|
||||||
|
EvalToken: w.evalToken,
|
||||||
|
WriteRequest: structs.WriteRequest{
|
||||||
|
Region: w.srv.config.Region,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
var resp structs.GenericResponse
|
||||||
|
|
||||||
|
SUBMIT:
|
||||||
|
// Make the RPC call
|
||||||
|
if err := w.srv.RPC("Eval.Reblock", &req, &resp); err != nil {
|
||||||
|
w.logger.Printf("[ERR] worker: failed to reblock evaluation %#v: %v",
|
||||||
|
eval, err)
|
||||||
|
if w.shouldResubmit(err) && !w.backoffErr(backoffBaselineSlow, backoffLimitSlow) {
|
||||||
|
goto SUBMIT
|
||||||
|
}
|
||||||
|
return err
|
||||||
|
} else {
|
||||||
|
w.logger.Printf("[DEBUG] worker: reblocked evaluation %#v", eval)
|
||||||
|
w.backoffReset()
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
// shouldResubmit checks if a given error should be swallowed and the plan
|
// shouldResubmit checks if a given error should be swallowed and the plan
|
||||||
// resubmitted after a backoff. Usually these are transient errors that
|
// resubmitted after a backoff. Usually these are transient errors that
|
||||||
// the cluster should heal from quickly.
|
// the cluster should heal from quickly.
|
||||||
|
|
|
@ -398,6 +398,9 @@ func TestWorker_UpdateEval(t *testing.T) {
|
||||||
if out.Status != structs.EvalStatusComplete {
|
if out.Status != structs.EvalStatusComplete {
|
||||||
t.Fatalf("bad: %v", out)
|
t.Fatalf("bad: %v", out)
|
||||||
}
|
}
|
||||||
|
if out.SnapshotIndex != w.snapshotIndex {
|
||||||
|
t.Fatalf("bad: %v", out)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestWorker_CreateEval(t *testing.T) {
|
func TestWorker_CreateEval(t *testing.T) {
|
||||||
|
@ -441,4 +444,74 @@ func TestWorker_CreateEval(t *testing.T) {
|
||||||
if out.PreviousEval != eval1.ID {
|
if out.PreviousEval != eval1.ID {
|
||||||
t.Fatalf("bad: %v", out)
|
t.Fatalf("bad: %v", out)
|
||||||
}
|
}
|
||||||
|
if out.SnapshotIndex != w.snapshotIndex {
|
||||||
|
t.Fatalf("bad: %v", out)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestWorker_ReblockEval(t *testing.T) {
|
||||||
|
s1 := testServer(t, func(c *Config) {
|
||||||
|
c.NumSchedulers = 0
|
||||||
|
c.EnabledSchedulers = []string{structs.JobTypeService}
|
||||||
|
})
|
||||||
|
defer s1.Shutdown()
|
||||||
|
testutil.WaitForLeader(t, s1.RPC)
|
||||||
|
|
||||||
|
// Create the blocked eval
|
||||||
|
eval1 := mock.Eval()
|
||||||
|
eval1.Status = structs.EvalStatusBlocked
|
||||||
|
|
||||||
|
// Insert it into the state store
|
||||||
|
if err := s1.fsm.State().UpsertEvals(1000, []*structs.Evaluation{eval1}); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Enqueue the eval and then dequeue
|
||||||
|
s1.evalBroker.Enqueue(eval1)
|
||||||
|
evalOut, token, err := s1.evalBroker.Dequeue([]string{eval1.Type}, time.Second)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("err: %v", err)
|
||||||
|
}
|
||||||
|
if evalOut != eval1 {
|
||||||
|
t.Fatalf("Bad eval")
|
||||||
|
}
|
||||||
|
|
||||||
|
eval2 := evalOut.Copy()
|
||||||
|
|
||||||
|
// Attempt to reblock eval
|
||||||
|
w := &Worker{srv: s1, logger: s1.logger, evalToken: token}
|
||||||
|
err = w.ReblockEval(eval2)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("err: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Ack the eval
|
||||||
|
w.sendAck(evalOut.ID, token, true)
|
||||||
|
|
||||||
|
// Check that it is blocked
|
||||||
|
bStats := s1.blockedEvals.Stats()
|
||||||
|
if bStats.TotalBlocked+bStats.TotalEscaped != 1 {
|
||||||
|
t.Fatalf("ReblockEval didn't insert eval into the blocked eval tracker: %#v", bStats)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check that the snapshot index was set properly by unblocking the eval and
|
||||||
|
// then dequeuing.
|
||||||
|
s1.blockedEvals.Unblock("foobar", 1000)
|
||||||
|
|
||||||
|
reblockedEval, _, err := s1.evalBroker.Dequeue([]string{eval1.Type}, 1*time.Second)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("err: %v", err)
|
||||||
|
}
|
||||||
|
if reblockedEval == nil {
|
||||||
|
t.Fatalf("Nil eval")
|
||||||
|
}
|
||||||
|
if reblockedEval.ID != eval1.ID {
|
||||||
|
t.Fatalf("Bad eval")
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check that the SnapshotIndex is set
|
||||||
|
if reblockedEval.SnapshotIndex != w.snapshotIndex {
|
||||||
|
t.Fatalf("incorrect snapshot index; got %d; want %d",
|
||||||
|
reblockedEval.SnapshotIndex, w.snapshotIndex)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -114,7 +114,7 @@ func (s *GenericScheduler) Process(eval *structs.Evaluation) error {
|
||||||
// Scheduling was tried but made no forward progress so create a
|
// Scheduling was tried but made no forward progress so create a
|
||||||
// blocked eval to retry once resources become available.
|
// blocked eval to retry once resources become available.
|
||||||
var mErr multierror.Error
|
var mErr multierror.Error
|
||||||
if err := s.createBlockedEval(); err != nil {
|
if err := s.createBlockedEval(true); err != nil {
|
||||||
mErr.Errors = append(mErr.Errors, err)
|
mErr.Errors = append(mErr.Errors, err)
|
||||||
}
|
}
|
||||||
if err := setStatus(s.logger, s.planner, s.eval, s.nextEval, s.blocked, statusErr.EvalStatus, err.Error()); err != nil {
|
if err := setStatus(s.logger, s.planner, s.eval, s.nextEval, s.blocked, statusErr.EvalStatus, err.Error()); err != nil {
|
||||||
|
@ -125,12 +125,19 @@ func (s *GenericScheduler) Process(eval *structs.Evaluation) error {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// If the current evaluation is a blocked evaluation and we didn't place
|
||||||
|
// everything, do not update the status to complete.
|
||||||
|
if s.eval.Status == structs.EvalStatusBlocked && len(s.eval.FailedTGAllocs) != 0 {
|
||||||
|
return s.planner.ReblockEval(s.eval)
|
||||||
|
}
|
||||||
|
|
||||||
// Update the status to complete
|
// Update the status to complete
|
||||||
return setStatus(s.logger, s.planner, s.eval, s.nextEval, s.blocked, structs.EvalStatusComplete, "")
|
return setStatus(s.logger, s.planner, s.eval, s.nextEval, s.blocked, structs.EvalStatusComplete, "")
|
||||||
}
|
}
|
||||||
|
|
||||||
// createBlockedEval creates a blocked eval and stores it.
|
// createBlockedEval creates a blocked eval and submits it to the planner. If
|
||||||
func (s *GenericScheduler) createBlockedEval() error {
|
// failure is set to true, the eval's trigger reason reflects that.
|
||||||
|
func (s *GenericScheduler) createBlockedEval(planFailure bool) error {
|
||||||
e := s.ctx.Eligibility()
|
e := s.ctx.Eligibility()
|
||||||
escaped := e.HasEscaped()
|
escaped := e.HasEscaped()
|
||||||
|
|
||||||
|
@ -141,6 +148,10 @@ func (s *GenericScheduler) createBlockedEval() error {
|
||||||
}
|
}
|
||||||
|
|
||||||
s.blocked = s.eval.CreateBlockedEval(classEligibility, escaped)
|
s.blocked = s.eval.CreateBlockedEval(classEligibility, escaped)
|
||||||
|
if planFailure {
|
||||||
|
s.blocked.TriggeredBy = structs.EvalTriggerMaxPlans
|
||||||
|
}
|
||||||
|
|
||||||
return s.planner.CreateEval(s.blocked)
|
return s.planner.CreateEval(s.blocked)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -177,9 +188,10 @@ func (s *GenericScheduler) process() (bool, error) {
|
||||||
}
|
}
|
||||||
|
|
||||||
// If there are failed allocations, we need to create a blocked evaluation
|
// If there are failed allocations, we need to create a blocked evaluation
|
||||||
// to place the failed allocations when resources become available.
|
// to place the failed allocations when resources become available. If the
|
||||||
if len(s.eval.FailedTGAllocs) != 0 && s.blocked == nil {
|
// current evaluation is already a blocked eval, we reuse it.
|
||||||
if err := s.createBlockedEval(); err != nil {
|
if s.eval.Status != structs.EvalStatusBlocked && len(s.eval.FailedTGAllocs) != 0 && s.blocked == nil {
|
||||||
|
if err := s.createBlockedEval(false); err != nil {
|
||||||
s.logger.Printf("[ERR] sched: %#v failed to make blocked eval: %v", s.eval, err)
|
s.logger.Printf("[ERR] sched: %#v failed to make blocked eval: %v", s.eval, err)
|
||||||
return false, err
|
return false, err
|
||||||
}
|
}
|
||||||
|
|
|
@ -275,7 +275,7 @@ func TestServiceSched_JobRegister_AllocFail(t *testing.T) {
|
||||||
h.AssertEvalStatus(t, structs.EvalStatusComplete)
|
h.AssertEvalStatus(t, structs.EvalStatusComplete)
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestServiceSched_JobRegister_BlockedEval(t *testing.T) {
|
func TestServiceSched_JobRegister_CreateBlockedEval(t *testing.T) {
|
||||||
h := NewHarness(t)
|
h := NewHarness(t)
|
||||||
|
|
||||||
// Create a full node
|
// Create a full node
|
||||||
|
@ -454,6 +454,126 @@ func TestServiceSched_JobRegister_FeasibleAndInfeasibleTG(t *testing.T) {
|
||||||
h.AssertEvalStatus(t, structs.EvalStatusComplete)
|
h.AssertEvalStatus(t, structs.EvalStatusComplete)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestServiceSched_EvaluateBlockedEval(t *testing.T) {
|
||||||
|
h := NewHarness(t)
|
||||||
|
|
||||||
|
// Create a job and set the task group count to zero.
|
||||||
|
job := mock.Job()
|
||||||
|
noErr(t, h.State.UpsertJob(h.NextIndex(), job))
|
||||||
|
|
||||||
|
// Create a mock blocked evaluation
|
||||||
|
eval := &structs.Evaluation{
|
||||||
|
ID: structs.GenerateUUID(),
|
||||||
|
Status: structs.EvalStatusBlocked,
|
||||||
|
Priority: job.Priority,
|
||||||
|
TriggeredBy: structs.EvalTriggerJobRegister,
|
||||||
|
JobID: job.ID,
|
||||||
|
}
|
||||||
|
|
||||||
|
// Insert it into the state store
|
||||||
|
noErr(t, h.State.UpsertEvals(h.NextIndex(), []*structs.Evaluation{eval}))
|
||||||
|
|
||||||
|
// Process the evaluation
|
||||||
|
err := h.Process(NewServiceScheduler, eval)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("err: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Ensure there was no plan
|
||||||
|
if len(h.Plans) != 0 {
|
||||||
|
t.Fatalf("bad: %#v", h.Plans)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Ensure that the eval was reblocked
|
||||||
|
if len(h.ReblockEvals) != 1 {
|
||||||
|
t.Fatalf("bad: %#v", h.ReblockEvals)
|
||||||
|
}
|
||||||
|
if h.ReblockEvals[0].ID != eval.ID {
|
||||||
|
t.Fatalf("expect same eval to be reblocked; got %q; want %q", h.ReblockEvals[0].ID, eval.ID)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Ensure the eval status was not updated
|
||||||
|
if len(h.Evals) != 0 {
|
||||||
|
t.Fatalf("Existing eval should not have status set")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestServiceSched_EvaluateBlockedEval_Finished(t *testing.T) {
|
||||||
|
h := NewHarness(t)
|
||||||
|
|
||||||
|
// Create some nodes
|
||||||
|
for i := 0; i < 10; i++ {
|
||||||
|
node := mock.Node()
|
||||||
|
noErr(t, h.State.UpsertNode(h.NextIndex(), node))
|
||||||
|
}
|
||||||
|
|
||||||
|
// Create a job and set the task group count to zero.
|
||||||
|
job := mock.Job()
|
||||||
|
noErr(t, h.State.UpsertJob(h.NextIndex(), job))
|
||||||
|
|
||||||
|
// Create a mock blocked evaluation
|
||||||
|
eval := &structs.Evaluation{
|
||||||
|
ID: structs.GenerateUUID(),
|
||||||
|
Status: structs.EvalStatusBlocked,
|
||||||
|
Priority: job.Priority,
|
||||||
|
TriggeredBy: structs.EvalTriggerJobRegister,
|
||||||
|
JobID: job.ID,
|
||||||
|
}
|
||||||
|
|
||||||
|
// Insert it into the state store
|
||||||
|
noErr(t, h.State.UpsertEvals(h.NextIndex(), []*structs.Evaluation{eval}))
|
||||||
|
|
||||||
|
// Process the evaluation
|
||||||
|
err := h.Process(NewServiceScheduler, eval)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("err: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Ensure a single plan
|
||||||
|
if len(h.Plans) != 1 {
|
||||||
|
t.Fatalf("bad: %#v", h.Plans)
|
||||||
|
}
|
||||||
|
plan := h.Plans[0]
|
||||||
|
|
||||||
|
// Ensure the plan doesn't have annotations.
|
||||||
|
if plan.Annotations != nil {
|
||||||
|
t.Fatalf("expected no annotations")
|
||||||
|
}
|
||||||
|
|
||||||
|
// Ensure the eval has no spawned blocked eval
|
||||||
|
if len(h.Evals) != 1 {
|
||||||
|
t.Fatalf("bad: %#v", h.Evals)
|
||||||
|
if h.Evals[0].BlockedEval != "" {
|
||||||
|
t.Fatalf("bad: %#v", h.Evals[0])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Ensure the plan allocated
|
||||||
|
var planned []*structs.Allocation
|
||||||
|
for _, allocList := range plan.NodeAllocation {
|
||||||
|
planned = append(planned, allocList...)
|
||||||
|
}
|
||||||
|
if len(planned) != 10 {
|
||||||
|
t.Fatalf("bad: %#v", plan)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Lookup the allocations by JobID
|
||||||
|
out, err := h.State.AllocsByJob(job.ID)
|
||||||
|
noErr(t, err)
|
||||||
|
|
||||||
|
// Ensure all allocations placed
|
||||||
|
if len(out) != 10 {
|
||||||
|
t.Fatalf("bad: %#v", out)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Ensure the eval was not reblocked
|
||||||
|
if len(h.ReblockEvals) != 0 {
|
||||||
|
t.Fatalf("Existing eval should not have been reblocked as it placed all allocations")
|
||||||
|
}
|
||||||
|
|
||||||
|
h.AssertEvalStatus(t, structs.EvalStatusComplete)
|
||||||
|
}
|
||||||
|
|
||||||
func TestServiceSched_JobModify(t *testing.T) {
|
func TestServiceSched_JobModify(t *testing.T) {
|
||||||
h := NewHarness(t)
|
h := NewHarness(t)
|
||||||
|
|
||||||
|
|
|
@ -87,4 +87,10 @@ type Planner interface {
|
||||||
// CreateEval is used to create an evaluation. This should set the
|
// CreateEval is used to create an evaluation. This should set the
|
||||||
// PreviousEval to that of the current evaluation.
|
// PreviousEval to that of the current evaluation.
|
||||||
CreateEval(*structs.Evaluation) error
|
CreateEval(*structs.Evaluation) error
|
||||||
|
|
||||||
|
// ReblockEval takes a blocked evaluation and re-inserts it into the blocked
|
||||||
|
// evaluation tracker. This update occurs only in-memory on the leader. The
|
||||||
|
// evaluation must exist in a blocked state prior to this being called such
|
||||||
|
// that on leader changes, the evaluation will be reblocked properly.
|
||||||
|
ReblockEval(*structs.Evaluation) error
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
package scheduler
|
package scheduler
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"fmt"
|
||||||
"log"
|
"log"
|
||||||
"os"
|
"os"
|
||||||
"sync"
|
"sync"
|
||||||
|
@ -29,6 +30,10 @@ func (r *RejectPlan) CreateEval(*structs.Evaluation) error {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (r *RejectPlan) ReblockEval(*structs.Evaluation) error {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
// Harness is a lightweight testing harness for schedulers. It manages a state
|
// Harness is a lightweight testing harness for schedulers. It manages a state
|
||||||
// store copy and provides the planner interface. It can be extended for various
|
// store copy and provides the planner interface. It can be extended for various
|
||||||
// testing uses or for invoking the scheduler without side effects.
|
// testing uses or for invoking the scheduler without side effects.
|
||||||
|
@ -41,6 +46,7 @@ type Harness struct {
|
||||||
Plans []*structs.Plan
|
Plans []*structs.Plan
|
||||||
Evals []*structs.Evaluation
|
Evals []*structs.Evaluation
|
||||||
CreateEvals []*structs.Evaluation
|
CreateEvals []*structs.Evaluation
|
||||||
|
ReblockEvals []*structs.Evaluation
|
||||||
|
|
||||||
nextIndex uint64
|
nextIndex uint64
|
||||||
nextIndexLock sync.Mutex
|
nextIndexLock sync.Mutex
|
||||||
|
@ -138,6 +144,28 @@ func (h *Harness) CreateEval(eval *structs.Evaluation) error {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (h *Harness) ReblockEval(eval *structs.Evaluation) error {
|
||||||
|
// Ensure sequential plan application
|
||||||
|
h.planLock.Lock()
|
||||||
|
defer h.planLock.Unlock()
|
||||||
|
|
||||||
|
// Check that the evaluation was already blocked.
|
||||||
|
old, err := h.State.EvalByID(eval.ID)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
if old == nil {
|
||||||
|
return fmt.Errorf("evaluation does not exist to be reblocked")
|
||||||
|
}
|
||||||
|
if old.Status != structs.EvalStatusBlocked {
|
||||||
|
return fmt.Errorf("evaluation %q is not already in a blocked state", old.ID)
|
||||||
|
}
|
||||||
|
|
||||||
|
h.ReblockEvals = append(h.ReblockEvals, eval)
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
// NextIndex returns the next index
|
// NextIndex returns the next index
|
||||||
func (h *Harness) NextIndex() uint64 {
|
func (h *Harness) NextIndex() uint64 {
|
||||||
h.nextIndexLock.Lock()
|
h.nextIndexLock.Lock()
|
||||||
|
|
Loading…
Reference in New Issue