core: reschedule evicted batch job when resources become available

This PR fixes a bug where an evicted batch job would not be rescheduled
once resources become available.

Closes #9890
This commit is contained in:
Seth Hoenig 2022-06-02 11:42:15 -05:00
parent 0399b7e4c5
commit 0692190e12
3 changed files with 115 additions and 3 deletions

3
.changelog/13205.txt Normal file
View File

@ -0,0 +1,3 @@
```release-note:bug
core: Fixed a bug where an evicted batch job would not be rescheduled
```

View File

@ -401,7 +401,8 @@ func (a allocSet) filterByRescheduleable(isBatch, isDisconnecting bool, now time
return
}
// shouldFilter returns whether the alloc should be ignored or considered untainted
// shouldFilter returns whether the alloc should be ignored or considered untainted.
//
// Ignored allocs are filtered out.
// Untainted allocs count against the desired total.
// Filtering logic for batch jobs:
@ -418,11 +419,13 @@ func shouldFilter(alloc *structs.Allocation, isBatch bool) (untainted, ignore bo
// complete but not failed, they shouldn't be replaced.
if isBatch {
switch alloc.DesiredStatus {
case structs.AllocDesiredStatusStop, structs.AllocDesiredStatusEvict:
case structs.AllocDesiredStatusStop:
if alloc.RanSuccessfully() {
return true, false
}
return false, true
case structs.AllocDesiredStatusEvict:
return false, true
default:
}

View File

@ -2,7 +2,6 @@ package scheduler
import (
"testing"
"time"
"github.com/hashicorp/nomad/ci"
@ -826,3 +825,110 @@ func TestAllocSet_filterByTainted(t *testing.T) {
})
}
}
func TestReconcile_shouldFilter(t *testing.T) {
testCases := []struct {
description string
batch bool
failed bool
desiredStatus string
clientStatus string
untainted bool
ignore bool
}{
{
description: "batch running",
batch: true,
failed: false,
desiredStatus: structs.AllocDesiredStatusRun,
clientStatus: structs.AllocClientStatusRunning,
untainted: true,
ignore: false,
},
{
description: "batch stopped success",
batch: true,
failed: false,
desiredStatus: structs.AllocDesiredStatusStop,
clientStatus: structs.AllocClientStatusRunning,
untainted: true,
ignore: false,
},
{
description: "batch stopped failed",
batch: true,
failed: true,
desiredStatus: structs.AllocDesiredStatusStop,
clientStatus: structs.AllocClientStatusComplete,
untainted: false,
ignore: true,
},
{
description: "batch evicted",
batch: true,
desiredStatus: structs.AllocDesiredStatusEvict,
clientStatus: structs.AllocClientStatusComplete,
untainted: false,
ignore: true,
},
{
description: "batch failed",
batch: true,
desiredStatus: structs.AllocDesiredStatusRun,
clientStatus: structs.AllocClientStatusFailed,
untainted: false,
ignore: false,
},
{
description: "service running",
batch: false,
failed: false,
desiredStatus: structs.AllocDesiredStatusRun,
clientStatus: structs.AllocClientStatusRunning,
untainted: false,
ignore: false,
},
{
description: "service stopped",
batch: false,
failed: false,
desiredStatus: structs.AllocDesiredStatusStop,
clientStatus: structs.AllocClientStatusComplete,
untainted: false,
ignore: true,
},
{
description: "service evicted",
batch: false,
failed: false,
desiredStatus: structs.AllocDesiredStatusEvict,
clientStatus: structs.AllocClientStatusComplete,
untainted: false,
ignore: true,
},
{
description: "service client complete",
batch: false,
failed: false,
desiredStatus: structs.AllocDesiredStatusRun,
clientStatus: structs.AllocClientStatusComplete,
untainted: false,
ignore: true,
},
}
for _, tc := range testCases {
t.Run(tc.description, func(t *testing.T) {
alloc := &structs.Allocation{
DesiredStatus: tc.desiredStatus,
TaskStates: map[string]*structs.TaskState{"task": {State: structs.TaskStateDead, Failed: tc.failed}},
ClientStatus: tc.clientStatus,
}
untainted, ignore := shouldFilter(alloc, tc.batch)
require.Equal(t, tc.untainted, untainted)
require.Equal(t, tc.ignore, ignore)
})
}
}