scheduler: stopped-yet-running allocs are still running (#10446)

* scheduler: stopped-yet-running allocs are still running

* scheduler: test new stopped-but-running logic

* test: assert nonoverlapping alloc behavior

Also add a simpler Wait test helper to improve reported failure line numbers
and save a few lines of code.

* docs: tried my best to describe #10446

it's not concise... feedback welcome

* scheduler: fix test that allowed overlapping allocs

* devices: only free devices when ClientStatus is terminal

* test: output nicer failure message if err==nil

Co-authored-by: Mahmood Ali <mahmood@hashicorp.com>
Co-authored-by: Michael Schurter <mschurter@hashicorp.com>
Mahmood Ali 2022-09-13 15:52:47 -04:00 committed by GitHub
parent 984e2cc4f1
commit a9d5e4c510
12 changed files with 474 additions and 19 deletions

.changelog/10446.txt (new file)

@@ -0,0 +1,3 @@
```release-note:bug
core: prevent new allocations from overlapping execution with stopping allocations
```

@@ -19,8 +19,10 @@ import (
"github.com/hashicorp/nomad/testutil"
"github.com/hashicorp/raft"
"github.com/kr/pretty"
"github.com/shoenig/test/must"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"golang.org/x/exp/slices"
)
func TestJobEndpoint_Register(t *testing.T) {
@@ -109,6 +111,168 @@ func TestJobEndpoint_Register(t *testing.T) {
}
}
// TestJobEndpoint_Register_NonOverlapping asserts that ClientStatus must be
// terminal, not just DesiredStatus, for the resources used by a job to be
// considered free for subsequent placements to use.
//
// See: https://github.com/hashicorp/nomad/issues/10440
func TestJobEndpoint_Register_NonOverlapping(t *testing.T) {
ci.Parallel(t)
s1, cleanupS1 := TestServer(t, func(c *Config) {
})
defer cleanupS1()
state := s1.fsm.State()
// Create a mock node with easy-to-check resources
node := mock.Node()
node.Resources = nil // Deprecated in 0.9
node.NodeResources.Cpu.CpuShares = 700
must.NoError(t, state.UpsertNode(structs.MsgTypeTestSetup, 1, node))
codec := rpcClient(t, s1)
testutil.WaitForLeader(t, s1.RPC)
// Create the register request
job := mock.Job()
job.TaskGroups[0].Count = 1
req := &structs.JobRegisterRequest{
Job: job.Copy(),
WriteRequest: structs.WriteRequest{
Region: "global",
Namespace: job.Namespace,
},
}
// Fetch the response
var resp structs.JobRegisterResponse
must.NoError(t, msgpackrpc.CallWithCodec(codec, "Job.Register", req, &resp))
must.NonZero(t, resp.Index)
// Assert placement
jobReq := &structs.JobSpecificRequest{
JobID: job.ID,
QueryOptions: structs.QueryOptions{
Region: "global",
Namespace: structs.DefaultNamespace,
},
}
var alloc *structs.AllocListStub
testutil.Wait(t, func() (bool, error) {
resp := structs.JobAllocationsResponse{}
must.NoError(t, msgpackrpc.CallWithCodec(codec, "Job.Allocations", jobReq, &resp))
if n := len(resp.Allocations); n != 1 {
return false, fmt.Errorf("expected 1 allocation but found %d:\n%v", n, resp.Allocations)
}
alloc = resp.Allocations[0]
return true, nil
})
must.Eq(t, alloc.NodeID, node.ID)
must.Eq(t, alloc.DesiredStatus, structs.AllocDesiredStatusRun)
must.Eq(t, alloc.ClientStatus, structs.AllocClientStatusPending)
// Stop
stopReq := &structs.JobDeregisterRequest{
JobID: job.ID,
Purge: false,
WriteRequest: req.WriteRequest,
}
var stopResp structs.JobDeregisterResponse
must.NoError(t, msgpackrpc.CallWithCodec(codec, "Job.Deregister", stopReq, &stopResp))
// Assert new register blocked
req.Job = job.Copy()
must.NoError(t, msgpackrpc.CallWithCodec(codec, "Job.Register", req, &resp))
must.NonZero(t, resp.Index)
blockedEval := ""
testutil.Wait(t, func() (bool, error) {
// Assert no new allocs
allocResp := structs.JobAllocationsResponse{}
must.NoError(t, msgpackrpc.CallWithCodec(codec, "Job.Allocations", jobReq, &allocResp))
if n := len(allocResp.Allocations); n != 1 {
return false, fmt.Errorf("expected 1 allocation but found %d:\n%v", n, allocResp.Allocations)
}
if alloc.ID != allocResp.Allocations[0].ID {
return false, fmt.Errorf("unexpected change in alloc: %#v", *allocResp.Allocations[0])
}
eval, err := state.EvalByID(nil, resp.EvalID)
must.NoError(t, err)
if eval == nil {
return false, fmt.Errorf("eval not applied: %s", resp.EvalID)
}
if eval.Status != structs.EvalStatusComplete {
return false, fmt.Errorf("expected eval to be complete but found: %s", eval.Status)
}
if eval.BlockedEval == "" {
return false, fmt.Errorf("expected a blocked eval to be created")
}
blockedEval = eval.BlockedEval
return true, nil
})
// Set ClientStatus=complete like a client would
stoppedAlloc := &structs.Allocation{
ID: alloc.ID,
NodeID: alloc.NodeID,
TaskStates: map[string]*structs.TaskState{
"web": &structs.TaskState{
State: structs.TaskStateDead,
},
},
ClientStatus: structs.AllocClientStatusComplete,
DeploymentStatus: nil, // should not have an impact
NetworkStatus: nil, // should not have an impact
}
upReq := &structs.AllocUpdateRequest{
Alloc: []*structs.Allocation{stoppedAlloc},
WriteRequest: req.WriteRequest,
}
var upResp structs.GenericResponse
must.NoError(t, msgpackrpc.CallWithCodec(codec, "Node.UpdateAlloc", upReq, &upResp))
// Assert newer register's eval unblocked
testutil.Wait(t, func() (bool, error) {
eval, err := state.EvalByID(nil, blockedEval)
must.NoError(t, err)
must.NotNil(t, eval)
if eval.Status != structs.EvalStatusComplete {
return false, fmt.Errorf("expected blocked eval to be complete but found: %s", eval.Status)
}
return true, nil
})
// Assert new alloc placed
testutil.Wait(t, func() (bool, error) {
// Assert the new alloc was placed alongside the old one
allocResp := structs.JobAllocationsResponse{}
must.NoError(t, msgpackrpc.CallWithCodec(codec, "Job.Allocations", jobReq, &allocResp))
if n := len(allocResp.Allocations); n != 2 {
return false, fmt.Errorf("expected 2 allocs but found %d:\n%v", n, allocResp.Allocations)
}
slices.SortFunc(allocResp.Allocations, func(a, b *structs.AllocListStub) bool {
return a.CreateIndex < b.CreateIndex
})
if alloc.ID != allocResp.Allocations[0].ID {
return false, fmt.Errorf("unexpected change in alloc: %#v", *allocResp.Allocations[0])
}
if cs := allocResp.Allocations[0].ClientStatus; cs != structs.AllocClientStatusComplete {
return false, fmt.Errorf("expected old alloc to be complete but found: %s", cs)
}
if cs := allocResp.Allocations[1].ClientStatus; cs != structs.AllocClientStatusPending {
return false, fmt.Errorf("expected new alloc to be pending but found: %s", cs)
}
return true, nil
})
}
func TestJobEndpoint_Register_PreserveCounts(t *testing.T) {
ci.Parallel(t)
require := require.New(t)

@@ -61,7 +61,7 @@ func NewDeviceAccounter(n *Node) *DeviceAccounter {
func (d *DeviceAccounter) AddAllocs(allocs []*Allocation) (collision bool) {
for _, a := range allocs {
// Filter any terminal allocation
if a.TerminalStatus() {
if a.ClientTerminalStatus() {
continue
}
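This one-line change (and the matching ones in `AllocsFit`, `NetworkIndex.AddAllocs`, and the scheduler context further down) is the heart of the fix: `TerminalStatus()` is true when *either* the server's desired status or the client-reported status is terminal, while `ClientTerminalStatus()` only trusts the client. A minimal, runnable sketch of the distinction; the constant values are illustrative, and the real helpers live on `structs.Allocation`:

```go
package main

import "fmt"

// Illustrative status values; the real constants live in nomad/structs.
const (
	AllocDesiredStatusStop    = "stop"
	AllocClientStatusRunning  = "running"
	AllocClientStatusComplete = "complete"
	AllocClientStatusFailed   = "failed"
	AllocClientStatusLost     = "lost"
)

// serverTerminal: the scheduler has decided the alloc should stop (the real
// ServerTerminalStatus helper also treats eviction as server-terminal).
func serverTerminal(desiredStatus string) bool {
	return desiredStatus == AllocDesiredStatusStop
}

// clientTerminal: the client has reported that the alloc actually stopped.
func clientTerminal(clientStatus string) bool {
	switch clientStatus {
	case AllocClientStatusComplete, AllocClientStatusFailed, AllocClientStatusLost:
		return true
	}
	return false
}

func main() {
	// A "stopped-yet-running" alloc: told to stop, still running on the client.
	desired, client := AllocDesiredStatusStop, AllocClientStatusRunning

	// Old filter (TerminalStatus-style OR check): resources freed too early.
	fmt.Println(serverTerminal(desired) || clientTerminal(client)) // true

	// New filter (ClientTerminalStatus): resources stay in use until the
	// client confirms the alloc has stopped.
	fmt.Println(clientTerminal(client)) // false
}
```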

@@ -6,6 +6,7 @@ import (
"github.com/hashicorp/nomad/ci"
"github.com/hashicorp/nomad/helper/uuid"
psstructs "github.com/hashicorp/nomad/plugins/shared/structs"
"github.com/shoenig/test/must"
"github.com/stretchr/testify/require"
)
@@ -162,6 +163,35 @@ func TestDeviceAccounter_AddAllocs_Collision(t *testing.T) {
require.True(d.AddAllocs(allocs))
}
// Assert that devices are not freed when only an alloc's ServerTerminalStatus
// is true; they are freed only once ClientTerminalStatus is true.
func TestDeviceAccounter_AddAllocs_TerminalStatus(t *testing.T) {
ci.Parallel(t)
n := devNode()
d := NewDeviceAccounter(n)
// Create two allocations, both with the same device. First is being told to
// stop but has not stopped yet.
a1, a2 := nvidiaAlloc(), nvidiaAlloc()
a1.DesiredStatus = AllocDesiredStatusStop
a1.ClientStatus = AllocClientStatusRunning
nvidiaDev0ID := n.NodeResources.Devices[0].Instances[0].ID
a1.AllocatedResources.Tasks["web"].Devices[0].DeviceIDs = []string{nvidiaDev0ID}
a2.AllocatedResources.Tasks["web"].Devices[0].DeviceIDs = []string{nvidiaDev0ID}
allocs := []*Allocation{a1, a2}
// Since a1 has not stopped on the client, its device is still in use
must.True(t, d.AddAllocs(allocs))
// Assert that stopping a1 on the client frees the device for use by a2
a1.ClientStatus = AllocClientStatusComplete
d = NewDeviceAccounter(n)
must.False(t, d.AddAllocs(allocs))
}
// Make sure that the device allocator works even if the node has no devices
func TestDeviceAccounter_AddReserved_NoDeviceNode(t *testing.T) {
ci.Parallel(t)

@@ -173,7 +173,7 @@ func AllocsFit(node *Node, allocs []*Allocation, netIdx *NetworkIndex, checkDevi
// For each alloc, add the resources
for _, alloc := range allocs {
// Do not consider the resource impact of terminal allocations
if alloc.TerminalStatus() {
if alloc.ClientTerminalStatus() {
continue
}

@@ -267,6 +267,7 @@ func TestAllocsFit_TerminalAlloc_Old(t *testing.T) {
// Should fit second allocation since it is terminal
a2 := a1.Copy()
a2.DesiredStatus = AllocDesiredStatusStop
a2.ClientStatus = AllocClientStatusComplete
fit, _, used, err = AllocsFit(n, []*Allocation{a1, a2}, nil, false)
require.NoError(err)
require.True(fit)
@@ -494,6 +495,7 @@ func TestAllocsFit_TerminalAlloc(t *testing.T) {
// Should fit second allocation since it is terminal
a2 := a1.Copy()
a2.DesiredStatus = AllocDesiredStatusStop
a2.ClientStatus = AllocClientStatusComplete
fit, dim, used, err := AllocsFit(n, []*Allocation{a1, a2}, nil, false)
require.NoError(err)
require.True(fit, dim)
@@ -501,6 +503,176 @@ func TestAllocsFit_TerminalAlloc(t *testing.T) {
require.EqualValues(1024, used.Flattened.Memory.MemoryMB)
}
// TestAllocsFit_ClientTerminalAlloc asserts that allocs which have a terminal
// ClientStatus *do not* have their resources counted as in-use.
func TestAllocsFit_ClientTerminalAlloc(t *testing.T) {
ci.Parallel(t)
n := &Node{
ID: "test-node",
NodeResources: &NodeResources{
Cpu: NodeCpuResources{
CpuShares: 2000,
},
Memory: NodeMemoryResources{
MemoryMB: 2048,
},
Disk: NodeDiskResources{
DiskMB: 10000,
},
Networks: []*NetworkResource{
{
Device: "eth0",
CIDR: "10.0.0.0/8",
IP: "10.0.0.1",
MBits: 100,
},
},
},
ReservedResources: &NodeReservedResources{
Cpu: NodeReservedCpuResources{
CpuShares: 1000,
},
Memory: NodeReservedMemoryResources{
MemoryMB: 1024,
},
Disk: NodeReservedDiskResources{
DiskMB: 5000,
},
Networks: NodeReservedNetworkResources{
ReservedHostPorts: "80",
},
},
}
liveAlloc := &Allocation{
ID: "test-alloc-live",
ClientStatus: AllocClientStatusPending,
DesiredStatus: AllocDesiredStatusRun,
AllocatedResources: &AllocatedResources{
Tasks: map[string]*AllocatedTaskResources{
"web": {
Cpu: AllocatedCpuResources{
CpuShares: 1000,
},
Memory: AllocatedMemoryResources{
MemoryMB: 1024,
},
Networks: []*NetworkResource{
{
Device: "eth0",
IP: "10.0.0.1",
MBits: 50,
ReservedPorts: []Port{{"main", 8000, 80, ""}},
},
},
},
},
Shared: AllocatedSharedResources{
DiskMB: 5000,
},
},
}
deadAlloc := liveAlloc.Copy()
deadAlloc.ID = "test-alloc-dead"
deadAlloc.ClientStatus = AllocClientStatusFailed
deadAlloc.DesiredStatus = AllocDesiredStatusRun
// *Should* fit both allocations since deadAlloc is not running on the
// client
fit, _, used, err := AllocsFit(n, []*Allocation{liveAlloc, deadAlloc}, nil, false)
require.NoError(t, err)
require.True(t, fit)
require.EqualValues(t, 1000, used.Flattened.Cpu.CpuShares)
require.EqualValues(t, 1024, used.Flattened.Memory.MemoryMB)
}
// TestAllocsFit_ServerTerminalAlloc asserts that allocs which have a terminal
// DesiredStatus but are still running on clients *do* have their resources
// counted as in-use.
func TestAllocsFit_ServerTerminalAlloc(t *testing.T) {
ci.Parallel(t)
n := &Node{
ID: "test-node",
NodeResources: &NodeResources{
Cpu: NodeCpuResources{
CpuShares: 2000,
},
Memory: NodeMemoryResources{
MemoryMB: 2048,
},
Disk: NodeDiskResources{
DiskMB: 10000,
},
Networks: []*NetworkResource{
{
Device: "eth0",
CIDR: "10.0.0.0/8",
IP: "10.0.0.1",
MBits: 100,
},
},
},
ReservedResources: &NodeReservedResources{
Cpu: NodeReservedCpuResources{
CpuShares: 1000,
},
Memory: NodeReservedMemoryResources{
MemoryMB: 1024,
},
Disk: NodeReservedDiskResources{
DiskMB: 5000,
},
Networks: NodeReservedNetworkResources{
ReservedHostPorts: "80",
},
},
}
liveAlloc := &Allocation{
ID: "test-alloc-live",
ClientStatus: AllocClientStatusPending,
DesiredStatus: AllocDesiredStatusRun,
AllocatedResources: &AllocatedResources{
Tasks: map[string]*AllocatedTaskResources{
"web": {
Cpu: AllocatedCpuResources{
CpuShares: 1000,
},
Memory: AllocatedMemoryResources{
MemoryMB: 1024,
},
Networks: []*NetworkResource{
{
Device: "eth0",
IP: "10.0.0.1",
MBits: 50,
ReservedPorts: []Port{{"main", 8000, 80, ""}},
},
},
},
},
Shared: AllocatedSharedResources{
DiskMB: 5000,
},
},
}
deadAlloc := liveAlloc.Copy()
deadAlloc.ID = "test-alloc-dead"
deadAlloc.ClientStatus = AllocClientStatusRunning
deadAlloc.DesiredStatus = AllocDesiredStatusStop
// Should *not* fit both allocations since deadAlloc is still running
fit, _, used, err := AllocsFit(n, []*Allocation{liveAlloc, deadAlloc}, nil, false)
require.NoError(t, err)
require.False(t, fit)
require.EqualValues(t, 2000, used.Flattened.Cpu.CpuShares)
require.EqualValues(t, 2048, used.Flattened.Memory.MemoryMB)
}
// Tests that AllocsFit detects device collisions
func TestAllocsFit_Devices(t *testing.T) {
ci.Parallel(t)

@@ -347,7 +347,7 @@ func (idx *NetworkIndex) SetNode(node *Node) error {
func (idx *NetworkIndex) AddAllocs(allocs []*Allocation) (collide bool, reason string) {
for _, alloc := range allocs {
// Do not consider the resource impact of terminal allocations
if alloc.TerminalStatus() {
if alloc.ClientTerminalStatus() {
continue
}

@@ -7,6 +7,7 @@ import (
"testing"
"github.com/hashicorp/nomad/ci"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
@@ -201,6 +202,8 @@ func TestNetworkIndex_AddAllocs(t *testing.T) {
idx := NewNetworkIndex()
allocs := []*Allocation{
{
ClientStatus: AllocClientStatusRunning,
DesiredStatus: AllocDesiredStatusRun,
AllocatedResources: &AllocatedResources{
Tasks: map[string]*AllocatedTaskResources{
"web": {
@@ -217,6 +220,8 @@ func TestNetworkIndex_AddAllocs(t *testing.T) {
},
},
{
ClientStatus: AllocClientStatusRunning,
DesiredStatus: AllocDesiredStatusRun,
AllocatedResources: &AllocatedResources{
Tasks: map[string]*AllocatedTaskResources{
"api": {
@@ -232,24 +237,56 @@ func TestNetworkIndex_AddAllocs(t *testing.T) {
},
},
},
{
// Allocations running on clients should have their
// ports counted even if their DesiredStatus=stop
ClientStatus: AllocClientStatusRunning,
DesiredStatus: AllocDesiredStatusStop,
AllocatedResources: &AllocatedResources{
Tasks: map[string]*AllocatedTaskResources{
"api": {
Networks: []*NetworkResource{
{
Device: "eth0",
IP: "192.168.0.100",
MBits: 50,
ReservedPorts: []Port{{"one", 10001, 0, ""}},
},
},
},
},
},
},
{
// Allocations *not* running on clients should *not*
// have their ports counted even if their
// DesiredStatus=run
ClientStatus: AllocClientStatusFailed,
DesiredStatus: AllocDesiredStatusRun,
AllocatedResources: &AllocatedResources{
Tasks: map[string]*AllocatedTaskResources{
"api": {
Networks: []*NetworkResource{
{
Device: "eth0",
IP: "192.168.0.100",
MBits: 50,
ReservedPorts: []Port{{"one", 10001, 0, ""}},
},
},
},
},
},
},
}
collide, reason := idx.AddAllocs(allocs)
if collide || reason != "" {
t.Fatalf("bad")
}
assert.False(t, collide)
assert.Empty(t, reason)
if idx.UsedBandwidth["eth0"] != 70 {
t.Fatalf("Bad")
}
if !idx.UsedPorts["192.168.0.100"].Check(8000) {
t.Fatalf("Bad")
}
if !idx.UsedPorts["192.168.0.100"].Check(9000) {
t.Fatalf("Bad")
}
if !idx.UsedPorts["192.168.0.100"].Check(10000) {
t.Fatalf("Bad")
}
assert.True(t, idx.UsedPorts["192.168.0.100"].Check(8000))
assert.True(t, idx.UsedPorts["192.168.0.100"].Check(9000))
assert.True(t, idx.UsedPorts["192.168.0.100"].Check(10000))
assert.True(t, idx.UsedPorts["192.168.0.100"].Check(10001))
}
func TestNetworkIndex_AddReserved(t *testing.T) {

@@ -173,7 +173,7 @@ func (e *EvalContext) Reset() {
func (e *EvalContext) ProposedAllocs(nodeID string) ([]*structs.Allocation, error) {
// Get the existing allocations that are non-terminal
ws := memdb.NewWatchSet()
proposed, err := e.state.AllocsByNodeTerminal(ws, nodeID, false)
proposed, err := e.state.AllocsByNode(ws, nodeID)
if err != nil {
return nil, err
}
@@ -194,6 +194,10 @@ func (e *EvalContext) ProposedAllocs(nodeID string) ([]*structs.Allocation, erro
// update occurs, we do not double count and we override the old allocation.
proposedIDs := make(map[string]*structs.Allocation, len(proposed))
for _, alloc := range proposed {
if alloc.ClientTerminalStatus() {
continue
}
proposedIDs[alloc.ID] = alloc
}
for _, alloc := range e.plan.NodeAllocation[nodeID] {
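Net effect of this hunk: `ProposedAllocs` now starts from *all* allocs on the node and drops only the ones the client has confirmed terminal, instead of relying on the state store's `TerminalStatus`-based filter. A condensed sketch of that first filtering step; the helper name and signature are illustrative, not the real scheduler API:

```go
package scheduler

import "github.com/hashicorp/nomad/nomad/structs"

// proposedBase sketches the first half of ProposedAllocs: of the allocs the
// state store knows about for a node, keep every one the client still
// considers live, even if the server has already marked it for stopping.
// The plan's own stops and placements are applied on top of this set.
func proposedBase(existing []*structs.Allocation) map[string]*structs.Allocation {
	proposed := make(map[string]*structs.Allocation, len(existing))
	for _, alloc := range existing {
		if alloc.ClientTerminalStatus() {
			// Only a client-reported terminal state frees the resources.
			continue
		}
		proposed[alloc.ID] = alloc
	}
	return proposed
}
```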

@@ -1596,6 +1596,7 @@ func TestServiceSched_JobModify(t *testing.T) {
alloc.NodeID = nodes[i].ID
alloc.Name = fmt.Sprintf("my-job.web[%d]", i)
alloc.DesiredStatus = structs.AllocDesiredStatusStop
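// With the #10446 fix, DesiredStatus=stop alone no longer frees an alloc's
// resources; the client must also report a terminal ClientStatus (failed here).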
alloc.ClientStatus = structs.AllocClientStatusFailed // #10446
terminal = append(terminal, alloc)
}
require.NoError(t, h.State.UpsertAllocs(structs.MsgTypeTestSetup, h.NextIndex(), terminal))

@@ -14,6 +14,28 @@ import (
type testFn func() (bool, error)
type errorFn func(error)
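// Wait polls test every 10ms, up to 500*TestMultiplier() attempts, until it
// returns true. If the retries are exhausted it fails t with the last error
// returned by test, or with a generic timeout message when that error is nil.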
func Wait(t *testing.T, test testFn) {
t.Helper()
retries := 500 * TestMultiplier()
for retries > 0 {
time.Sleep(10 * time.Millisecond)
retries--
success, err := test()
if success {
return
}
if retries == 0 {
if err == nil {
t.Fatalf("timeout waiting for test function to succeed (you should probably return a helpful error instead of nil!)")
} else {
t.Fatalf("timeout: %v", err)
}
}
}
}
func WaitForResult(test testFn, error errorFn) {
WaitForResultRetries(500*TestMultiplier(), test, error)
}
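For reference, the new helper is used by the job endpoint test above roughly like this (a fragment; `state` and `evalID` stand in for test-local variables):

```go
// Poll until the eval completes, returning descriptive errors so the eventual
// timeout message from testutil.Wait is useful.
testutil.Wait(t, func() (bool, error) {
	eval, err := state.EvalByID(nil, evalID)
	if err != nil {
		return false, err
	}
	if eval == nil {
		return false, fmt.Errorf("eval %q not applied yet", evalID)
	}
	if eval.Status != structs.EvalStatusComplete {
		return false, fmt.Errorf("eval not complete yet: %s", eval.Status)
	}
	return true, nil
})
```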

@@ -30,6 +30,25 @@ Audit Log filtering in previous versions of Nomad handled `stages` and
would be filtered. As of 1.4.0, `stages` and `operations` are treated as `AND
filters`. Logs will only be filtered if all filter conditions match.
#### Prevent Overlapping New Allocations with Stopping Allocations
Prior to Nomad 1.4.0, the scheduler considered the resources used by
allocations that were in the process of stopping to be free for new allocations
to use. This could cause a newer allocation to crash when it tried to use TCP
ports or memory still held by an allocation that was stopping. The new and
stopping [allocations would "overlap" improperly.][alloc_overlap]
[Nomad 1.4.0 fixes this behavior][gh_10446] so that an allocation's resources
are only considered free for reuse once the client node the allocation was
running on reports that it has stopped. Technically speaking: only once the
`Allocation.ClientStatus` has reached a terminal state (`complete`, `failed`,
or `lost`).
Despite this being a bug fix, it is considered a significant enough change in
behavior to reserve for a major Nomad release and *not* be backported. Please
report any negative side effects encountered as [new
issues.][gh_issue]
## Nomad 1.3.3
Environments that don't support the use of [`uid`][template_uid] and
@@ -1471,3 +1490,6 @@ deleted and then Nomad 0.3.0 can be launched.
[consul_acl]: https://github.com/hashicorp/consul/issues/7414
[kill_timeout]: /docs/job-specification/task#kill_timeout
[max_kill_timeout]: /docs/configuration/client#max_kill_timeout
[alloc_overlap]: https://github.com/hashicorp/nomad/issues/10440
[gh_10446]: https://github.com/hashicorp/nomad/pull/10446#issuecomment-1224833906
[gh_issue]: https://github.com/hashicorp/nomad/issues/new/choose