5a9abdc469
If an allocation is slow to stop because of `kill_timeout` or `shutdown_delay`, the node drain is marked complete prematurely, even though drain monitoring will continue to report allocation migrations. This impacts UI or API clients that monitor node draining in order to shut down nodes. This changeset updates the behavior to wait until the client status of every drained alloc is terminal before marking the node as done draining.
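To make the intent concrete: an alloc can be desired-terminal (the server has asked it to stop) long before it is client-terminal (the client has reported it complete, failed, or lost). A minimal sketch of the check this implies, using a hypothetical allClientTerminal helper rather than the actual drainer code, and assuming the ClientTerminalStatus helper that exists on structs.Allocation:

    // allClientTerminal reports whether every drained alloc has reached a
    // terminal client status, not merely a terminal desired status.
    // Hypothetical helper for illustration; not part of this changeset.
    func allClientTerminal(allocs []*structs.Allocation) bool {
        for _, a := range allocs {
            if !a.ClientTerminalStatus() {
                return false
            }
        }
        return true
    }

Only once this predicate holds for a node's drained allocs should the drainer mark that node as done draining. The tests below exercise exactly that boundary.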
// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: MPL-2.0

package drainer

import (
    "context"
    "testing"
    "time"

    "github.com/shoenig/test"
    "github.com/shoenig/test/must"
    "github.com/stretchr/testify/require"
    "golang.org/x/time/rate"

    "github.com/hashicorp/nomad/ci"
    "github.com/hashicorp/nomad/helper/pointer"
    "github.com/hashicorp/nomad/helper/testlog"
    "github.com/hashicorp/nomad/helper/uuid"
    "github.com/hashicorp/nomad/nomad/mock"
    "github.com/hashicorp/nomad/nomad/state"
    "github.com/hashicorp/nomad/nomad/structs"
)

func testNodes(t *testing.T, state *state.StateStore) (drainingNode, runningNode *structs.Node) {
    n1 := mock.Node()
    n1.Name = "draining"
    n1.DrainStrategy = &structs.DrainStrategy{
        DrainSpec: structs.DrainSpec{
            Deadline: time.Minute,
        },
        ForceDeadline: time.Now().Add(time.Minute),
    }
    require.Nil(t, state.UpsertNode(structs.MsgTypeTestSetup, 100, n1))

    // Create a non-draining node
    n2 := mock.Node()
    n2.Name = "running"
    require.Nil(t, state.UpsertNode(structs.MsgTypeTestSetup, 101, n2))
    return n1, n2
}

func testDrainingJobWatcher(t *testing.T, state *state.StateStore) (*drainingJobWatcher, context.CancelFunc) {
    t.Helper()

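    // A generous limiter (100 ops/sec, burst 100) so rate limiting never
    // stalls the watcher under test.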
    limiter := rate.NewLimiter(100.0, 100)
    logger := testlog.HCLogger(t)
    ctx, cancel := context.WithCancel(context.Background())
    w := NewDrainingJobWatcher(ctx, limiter, state, logger)
    return w, cancel
}

// TestDrainingJobWatcher_Interface is a compile-time assertion that we
// implement the intended interface.
func TestDrainingJobWatcher_Interface(t *testing.T) {
    ci.Parallel(t)

    w, cancel := testDrainingJobWatcher(t, state.TestStateStore(t))
    cancel()
    var _ DrainingJobWatcher = w
}

// assertJobWatcherOps asserts that a certain number of allocs are drained
// and/or migrated by the job watcher.
func assertJobWatcherOps(t *testing.T, jw DrainingJobWatcher, drained, migrated int) (
    *DrainRequest, []*structs.Allocation) {
    t.Helper()
    var (
        drains                           *DrainRequest
        migrations                       []*structs.Allocation
        drainsChecked, migrationsChecked bool
    )
    for {
        select {
        case drains = <-jw.Drain():
            ids := make([]string, len(drains.Allocs))
            for i, a := range drains.Allocs {
                ids[i] = a.JobID[:6] + ":" + a.ID[:6]
            }
            t.Logf("draining %d allocs: %v", len(ids), ids)
            require.False(t, drainsChecked, "drains already received")
            drainsChecked = true
            require.Lenf(t, drains.Allocs, drained,
                "expected %d drains but found %d", drained, len(drains.Allocs))
        case migrations = <-jw.Migrated():
            ids := make([]string, len(migrations))
            for i, a := range migrations {
                ids[i] = a.JobID[:6] + ":" + a.ID[:6]
            }
            t.Logf("migrating %d allocs: %v", len(ids), ids)
            require.False(t, migrationsChecked, "migrations already received")
            migrationsChecked = true
            require.Lenf(t, migrations, migrated,
                "expected %d migrations but found %d", migrated, len(migrations))
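        // Nothing received within the polling window: fail if ops were
        // expected, otherwise return whatever was observed.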
        case <-time.After(10 * time.Millisecond):
            if !drainsChecked && drained > 0 {
                t.Fatalf("expected %d drains but none happened", drained)
            }
            if !migrationsChecked && migrated > 0 {
                t.Fatalf("expected %d migrations but none happened", migrated)
            }
            return drains, migrations
        }
    }
}

// TestDrainingJobWatcher_DrainJobs asserts DrainingJobWatcher batches
// allocation changes from multiple jobs.
func TestDrainingJobWatcher_DrainJobs(t *testing.T) {
    ci.Parallel(t)

    store := state.TestStateStore(t)
    jobWatcher, cancelWatcher := testDrainingJobWatcher(t, store)
    defer cancelWatcher()
    drainingNode, runningNode := testNodes(t, store)

    var index uint64 = 101
    count := 8

    newAlloc := func(node *structs.Node, job *structs.Job) *structs.Allocation {
        a := mock.Alloc()
        a.JobID = job.ID
        a.Job = job
        a.TaskGroup = job.TaskGroups[0].Name
        a.NodeID = node.ID
        return a
    }

    // 2 jobs with count 8, max parallel 3
    jnss := make([]structs.NamespacedID, 2)
    jobs := make([]*structs.Job, 2)
    for i := 0; i < 2; i++ {
        job := mock.Job()
        jobs[i] = job
        jnss[i] = structs.NamespacedID{Namespace: job.Namespace, ID: job.ID}
        job.TaskGroups[0].Migrate.MaxParallel = 3
        job.TaskGroups[0].Count = count
        must.NoError(t, store.UpsertJob(structs.MsgTypeTestSetup, index, nil, job))
        index++

        var allocs []*structs.Allocation
        for i := 0; i < count; i++ {
            a := newAlloc(drainingNode, job)
            a.DeploymentStatus = &structs.AllocDeploymentStatus{
                Healthy: pointer.Of(true),
            }
            allocs = append(allocs, a)
        }

        must.NoError(t, store.UpsertAllocs(structs.MsgTypeTestSetup, index, allocs))
        index++
    }

    // Only register jobs with the watcher after creating all data models, as
    // once the watcher starts we need to track the index carefully for
    // updating the batch future.
    jobWatcher.RegisterJobs(jnss)

    // Expect a first batch of MaxParallel allocs from each job
    drains, _ := assertJobWatcherOps(t, jobWatcher, 6, 0)

    // Fake migrating the drained allocs by starting new ones and stopping
    // the old ones
    drainedAllocs := make([]*structs.Allocation, len(drains.Allocs))
    for i, a := range drains.Allocs {
        a.DesiredTransition.Migrate = pointer.Of(true)

        // create a copy so we can reuse this slice
        drainedAllocs[i] = a.Copy()
    }
    must.NoError(t, store.UpsertAllocs(structs.MsgTypeTestSetup, index, drainedAllocs))
    drains.Resp.Respond(index, nil)
    index++

    // Just setting ShouldMigrate should not cause any further drains
    assertJobWatcherOps(t, jobWatcher, 0, 0)

    // Proceed our fake migration along by creating new allocs and stopping
    // old ones
    replacements := make([]*structs.Allocation, len(drainedAllocs))
    updates := make([]*structs.Allocation, 0, len(drainedAllocs)*2)
    for i, a := range drainedAllocs {
        // Stop drained allocs
        a.DesiredTransition.Migrate = nil
        a.DesiredStatus = structs.AllocDesiredStatusStop

        // Create a replacement
        replacement := mock.Alloc()
        replacement.JobID = a.Job.ID
        replacement.Job = a.Job
        replacement.TaskGroup = a.TaskGroup
        replacement.NodeID = runningNode.ID
        // start in pending state with no health status

        updates = append(updates, a, replacement)
        replacements[i] = replacement.Copy()
    }
    must.NoError(t, store.UpsertAllocs(structs.MsgTypeTestSetup, index, updates))
    index++

    // Marking the drained allocs as stopped server-side is not enough: no
    // migrations are reported until the client reports the allocs terminal
    assertJobWatcherOps(t, jobWatcher, 0, 0)

    // Client reports these allocs complete
    completeAllocs := make([]*structs.Allocation, len(drainedAllocs))
    for i, a := range drainedAllocs {
        a = a.Copy()
        a.ClientStatus = structs.AllocClientStatusComplete
        completeAllocs[i] = a
    }
    must.NoError(t, store.UpdateAllocsFromClient(structs.MsgTypeTestSetup, index, completeAllocs))
    index++

    // The client-terminal allocs are now reported as migrated, but there are
    // no new drains because the replacements have not started
    assertJobWatcherOps(t, jobWatcher, 0, 6)

    // Finally kick off further drain activity by "starting" the replacements
    for _, a := range replacements {
        a.ClientStatus = structs.AllocClientStatusRunning
        a.DeploymentStatus = &structs.AllocDeploymentStatus{
            Healthy: pointer.Of(true),
        }
    }
    must.NoError(t, store.UpsertAllocs(structs.MsgTypeTestSetup, index, replacements))
    index++

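    // Both jobs should still be tracked as draining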
    must.MapNotEmpty(t, jobWatcher.drainingJobs())

    // 6 new drains
    drains, _ = assertJobWatcherOps(t, jobWatcher, 6, 0)

    // Fake migrations once more to finish the drain
    drainedAllocs = make([]*structs.Allocation, len(drains.Allocs))
    for i, a := range drains.Allocs {
        a.DesiredTransition.Migrate = pointer.Of(true)

        // create a copy so we can reuse this slice
        drainedAllocs[i] = a.Copy()
    }
    must.NoError(t, store.UpsertAllocs(structs.MsgTypeTestSetup, index, drainedAllocs))
    drains.Resp.Respond(index, nil)
    index++

    assertJobWatcherOps(t, jobWatcher, 0, 0)

    replacements = make([]*structs.Allocation, len(drainedAllocs))
    updates = make([]*structs.Allocation, 0, len(drainedAllocs)*2)
    for i, a := range drainedAllocs {
        a.DesiredTransition.Migrate = nil
        a.DesiredStatus = structs.AllocDesiredStatusStop
        a.ClientStatus = structs.AllocClientStatusComplete

        replacement := newAlloc(runningNode, a.Job)
        updates = append(updates, a, replacement)
        replacements[i] = replacement.Copy()
    }
    must.NoError(t, store.UpsertAllocs(structs.MsgTypeTestSetup, index, updates))
    index++

    assertJobWatcherOps(t, jobWatcher, 0, 6)

    for _, a := range replacements {
        a.ClientStatus = structs.AllocClientStatusRunning
        a.DeploymentStatus = &structs.AllocDeploymentStatus{
            Healthy: pointer.Of(true),
        }
    }
    must.NoError(t, store.UpsertAllocs(structs.MsgTypeTestSetup, index, replacements))
    index++

    must.MapNotEmpty(t, jobWatcher.drainingJobs())

    // Final 4 new drains
    drains, _ = assertJobWatcherOps(t, jobWatcher, 4, 0)

    // Fake migrations once more to finish the drain
    drainedAllocs = make([]*structs.Allocation, len(drains.Allocs))
    for i, a := range drains.Allocs {
        a.DesiredTransition.Migrate = pointer.Of(true)

        // create a copy so we can reuse this slice
        drainedAllocs[i] = a.Copy()
    }
    must.NoError(t, store.UpsertAllocs(structs.MsgTypeTestSetup, index, drainedAllocs))
    drains.Resp.Respond(index, nil)
    index++

    assertJobWatcherOps(t, jobWatcher, 0, 0)

    replacements = make([]*structs.Allocation, len(drainedAllocs))
    updates = make([]*structs.Allocation, 0, len(drainedAllocs)*2)
    for i, a := range drainedAllocs {
        a.DesiredTransition.Migrate = nil
        a.DesiredStatus = structs.AllocDesiredStatusStop
        a.ClientStatus = structs.AllocClientStatusComplete

        replacement := newAlloc(runningNode, a.Job)
        updates = append(updates, a, replacement)
        replacements[i] = replacement.Copy()
    }
    must.NoError(t, store.UpsertAllocs(structs.MsgTypeTestSetup, index, updates))
    index++

    assertJobWatcherOps(t, jobWatcher, 0, 4)

    for _, a := range replacements {
        a.ClientStatus = structs.AllocClientStatusRunning
        a.DeploymentStatus = &structs.AllocDeploymentStatus{
            Healthy: pointer.Of(true),
        }
    }
    must.NoError(t, store.UpsertAllocs(structs.MsgTypeTestSetup, index, replacements))

    // No jobs should be left!
    must.MapEmpty(t, jobWatcher.drainingJobs())
}

// TestDrainingJobWatcher_HandleTaskGroup tests that the watcher handles
// allocation updates as expected.
func TestDrainingJobWatcher_HandleTaskGroup(t *testing.T) {
    ci.Parallel(t)

    testCases := []struct {
        name        string
        batch       bool // use a batch job
        allocCount  int  // number of allocs in test (defaults to 10)
        maxParallel int  // max_parallel (defaults to 1)

        // addAllocFn will be called allocCount times to create test allocs,
        // and the allocs default to be healthy on the draining node
        addAllocFn func(idx int, a *structs.Allocation, drainingID, runningID string)

        expectDrained  int
        expectMigrated int
        expectDone     bool
    }{
        {
            // all allocs on draining node, should respect max_parallel=1
            name:           "drain-respects-max-parallel-1",
            expectDrained:  1,
            expectMigrated: 0,
            expectDone:     false,
        },
        {
            // allocs on a non-draining node, should not be drained
            name:           "allocs-on-non-draining-node-should-not-drain",
            expectDrained:  0,
            expectMigrated: 0,
            expectDone:     true,
            addAllocFn: func(i int, a *structs.Allocation, drainingID, runningID string) {
                a.NodeID = runningID
            },
        },
        {
            // even unhealthy allocs on a non-draining node should not be drained
            name:           "unhealthy-allocs-on-non-draining-node-should-not-drain",
            expectDrained:  0,
            expectMigrated: 0,
            expectDone:     false,
            addAllocFn: func(i int, a *structs.Allocation, drainingID, runningID string) {
                if i%2 == 0 {
                    a.NodeID = runningID
                    a.DeploymentStatus = nil
                }
            },
        },
        {
            // only the alloc on the draining node should be drained
            name:           "healthy-alloc-draining-node-should-drain",
            expectDrained:  1,
            expectMigrated: 0,
            expectDone:     false,
            addAllocFn: func(i int, a *structs.Allocation, drainingID, runningID string) {
                if i != 0 {
                    a.NodeID = runningID
                }
            },
        },
        {
            // alloc that's still draining doesn't produce more result updates
            name:           "still-draining-alloc-no-new-updates",
            expectDrained:  0,
            expectMigrated: 0,
            expectDone:     false,
            addAllocFn: func(i int, a *structs.Allocation, drainingID, runningID string) {
                if i == 0 {
                    a.DesiredTransition.Migrate = pointer.Of(true)
                    return
                }
                a.NodeID = runningID
            },
        },
        {
            // alloc that's finished draining gets marked as migrated
            name:           "client-terminal-alloc-drain-should-be-finished",
            expectDrained:  0,
            expectMigrated: 1,
            expectDone:     true,
            addAllocFn: func(i int, a *structs.Allocation, drainingID, runningID string) {
                if i == 0 {
                    a.DesiredStatus = structs.AllocDesiredStatusStop
                    a.ClientStatus = structs.AllocClientStatusComplete
                    return
                }
                a.NodeID = runningID
            },
        },
        {
            // batch alloc that's finished draining gets marked as migrated
            name:           "client-terminal-batch-alloc-drain-should-be-finished",
            batch:          true,
            expectDrained:  0,
            expectMigrated: 1,
            expectDone:     true,
            addAllocFn: func(i int, a *structs.Allocation, drainingID, runningID string) {
                if i == 0 {
                    a.DesiredStatus = structs.AllocDesiredStatusStop
                    a.ClientStatus = structs.AllocClientStatusComplete
                    return
                }
                a.NodeID = runningID
            },
        },
        {
            // all allocs are client-terminal, so nothing left to drain
            name:           "all-client-terminal-drain-should-be-finished",
            expectDrained:  0,
            expectMigrated: 10,
            expectDone:     true,
            addAllocFn: func(i int, a *structs.Allocation, drainingID, runningID string) {
                a.DesiredStatus = structs.AllocDesiredStatusStop
                a.ClientStatus = structs.AllocClientStatusComplete
            },
        },
        {
            // all allocs are terminal, but only half are client-terminal
            name:           "half-client-terminal-drain-should-not-be-finished",
            expectDrained:  0,
            expectMigrated: 5,
            expectDone:     false,
            addAllocFn: func(i int, a *structs.Allocation, drainingID, runningID string) {
                a.DesiredStatus = structs.AllocDesiredStatusStop
                if i%2 == 0 {
                    a.ClientStatus = structs.AllocClientStatusComplete
                }
            },
        },
        {
            // All allocs are terminal, nothing to be drained
            name:           "all-terminal-batch",
            batch:          true,
            expectDrained:  0,
            expectMigrated: 10,
            expectDone:     true,
            addAllocFn: func(i int, a *structs.Allocation, drainingID, runningID string) {
                a.DesiredStatus = structs.AllocDesiredStatusStop
                a.ClientStatus = structs.AllocClientStatusComplete
            },
        },
        {
            // with max_parallel=10, all allocs can be drained at once
            name:           "drain-respects-max-parallel-all-at-once",
            expectDrained:  10,
            expectMigrated: 0,
            expectDone:     false,
            maxParallel:    10,
        },
        {
            // with max_parallel=2, up to 2 allocs can be drained at a time
            name:           "drain-respects-max-parallel-2",
            expectDrained:  2,
            expectMigrated: 0,
            expectDone:     false,
            maxParallel:    2,
        },
        {
            // with max_parallel=2, up to 2 allocs can be drained at a time but
            // we haven't yet informed the drainer that 1 has completed
            // migrating
            name:           "notify-migrated-1-on-new-1-drained-1-draining",
            expectDrained:  1,
            expectMigrated: 1,
            maxParallel:    2,
            addAllocFn: func(i int, a *structs.Allocation, drainingID, runningID string) {
                switch i {
                case 0:
                    // One alloc on running node
                    a.NodeID = runningID
                case 1:
                    // One alloc already migrated
                    a.DesiredStatus = structs.AllocDesiredStatusStop
                    a.ClientStatus = structs.AllocClientStatusComplete
                }
            },
        },
        {
            // with max_parallel=2, up to 2 allocs can be drained at a time but
            // we haven't yet informed the drainer that 1 has completed
            // migrating
            name:           "notify-migrated-8-on-new-1-drained-1-draining",
            expectDrained:  1,
            expectMigrated: 1,
            maxParallel:    2,
            addAllocFn: func(i int, a *structs.Allocation, drainingID, runningID string) {
                switch i {
                case 0, 1, 2, 3, 4, 5, 6, 7:
                    a.NodeID = runningID
                case 8:
                    a.DesiredStatus = structs.AllocDesiredStatusStop
                    a.ClientStatus = structs.AllocClientStatusComplete
                }
            },
        },
        {
            // 5 on new node, two drained, and three draining
            // with max_parallel=5, up to 5 allocs can be drained at a time but
            // we haven't yet informed the drainer that 2 have completed
            // migrating
            name:           "notify-migrated-5-on-new-2-drained-3-draining",
            expectDrained:  3,
            expectMigrated: 2,
            maxParallel:    5,
            addAllocFn: func(i int, a *structs.Allocation, drainingID, runningID string) {
                switch i {
                case 0, 1, 2, 3, 4:
                    a.NodeID = runningID
                case 8, 9:
                    a.DesiredStatus = structs.AllocDesiredStatusStop
                    a.ClientStatus = structs.AllocClientStatusComplete
                }
            },
        },
        {
            // half the allocs have been moved to the new node but 1 doesn't
            // have health set yet, so we should have MaxParallel - 1 in flight
            name:           "pending-health-blocks",
            expectDrained:  1,
            expectMigrated: 1,
            maxParallel:    3,
            addAllocFn: func(i int, a *structs.Allocation, drainingID, runningID string) {
                switch i {
                case 0:
                    // Deployment status UNset for 1 on new node
                    a.NodeID = runningID
                    a.DeploymentStatus = nil
                case 1, 2, 3, 4:
                    // Deployment status set for 4 on new node
                    a.NodeID = runningID
                case 9:
                    a.DesiredStatus = structs.AllocDesiredStatusStop
                    a.ClientStatus = structs.AllocClientStatusComplete
                }
            },
        },
        {
            // half the allocs have been moved to the new node but 2 don't have
            // health set yet, so we should have MaxParallel - 2 in flight
            name:           "pending-health-blocks-higher-max",
            expectDrained:  2,
            expectMigrated: 1,
            maxParallel:    5,
            addAllocFn: func(i int, a *structs.Allocation, drainingID, runningID string) {
                switch i {
                case 0, 1:
                    // Deployment status UNset for 2 on new node
                    a.NodeID = runningID
                    a.DeploymentStatus = nil
                case 2, 3, 4:
                    // Deployment status set for 3 on new node
                    a.NodeID = runningID
                case 9:
                    a.DesiredStatus = structs.AllocDesiredStatusStop
                    a.ClientStatus = structs.AllocClientStatusComplete
                }
            },
        },
    }

    for _, tc := range testCases {
        tc := tc
        t.Run(tc.name, func(t *testing.T) {
            ci.Parallel(t)

            // Create nodes
            store := state.TestStateStore(t)
            drainingNode, runningNode := testNodes(t, store)

            job := mock.Job()
            if tc.batch {
                job = mock.BatchJob()
            }
            job.TaskGroups[0].Count = 10
            if tc.allocCount > 0 {
                job.TaskGroups[0].Count = tc.allocCount
            }
            if tc.maxParallel > 0 {
                job.TaskGroups[0].Migrate.MaxParallel = tc.maxParallel
            }
            must.NoError(t, store.UpsertJob(structs.MsgTypeTestSetup, 102, nil, job))

            var allocs []*structs.Allocation
            for i := 0; i < 10; i++ {
                a := mock.Alloc()
                if tc.batch {
                    a = mock.BatchAlloc()
                }
                a.JobID = job.ID
                a.Job = job
                a.TaskGroup = job.TaskGroups[0].Name

                // Default to being healthy on the draining node
                a.NodeID = drainingNode.ID
                a.DeploymentStatus = &structs.AllocDeploymentStatus{
                    Healthy: pointer.Of(true),
                }
                if tc.addAllocFn != nil {
                    tc.addAllocFn(i, a, drainingNode.ID, runningNode.ID)
                }
                allocs = append(allocs, a)
            }

            must.NoError(t, store.UpsertAllocs(structs.MsgTypeTestSetup, 103, allocs))
            snap, err := store.Snapshot()
            must.NoError(t, err)

            res := newJobResult()
            must.NoError(t, handleTaskGroup(snap, tc.batch, job.TaskGroups[0], allocs, 102, res))
            test.Len(t, tc.expectDrained, res.drain, test.Sprint("expected drained allocs"))
            test.Len(t, tc.expectMigrated, res.migrated, test.Sprint("expected migrated allocs"))
            test.Eq(t, tc.expectDone, res.done)
        })
    }
}

func TestHandleTaskGroup_Migrations(t *testing.T) {
    ci.Parallel(t)
    require := require.New(t)

    // Create a draining node
    state := state.TestStateStore(t)
    n := mock.Node()
    n.DrainStrategy = &structs.DrainStrategy{
        DrainSpec: structs.DrainSpec{
            Deadline: 5 * time.Minute,
        },
        ForceDeadline: time.Now().Add(1 * time.Minute),
    }
    require.Nil(state.UpsertNode(structs.MsgTypeTestSetup, 100, n))

    job := mock.Job()
    require.Nil(state.UpsertJob(structs.MsgTypeTestSetup, 101, nil, job))

    // Create 10 done allocs
    var allocs []*structs.Allocation
    for i := 0; i < 10; i++ {
        a := mock.Alloc()
        a.Job = job
        a.TaskGroup = job.TaskGroups[0].Name
        a.NodeID = n.ID
        a.DeploymentStatus = &structs.AllocDeploymentStatus{
            Healthy: pointer.Of(false),
        }

        if i%2 == 0 {
            a.DesiredStatus = structs.AllocDesiredStatusStop
            a.ClientStatus = structs.AllocClientStatusComplete
        } else {
            a.ClientStatus = structs.AllocClientStatusFailed
        }
        allocs = append(allocs, a)
    }
    require.Nil(state.UpsertAllocs(structs.MsgTypeTestSetup, 102, allocs))

    snap, err := state.Snapshot()
    require.Nil(err)

    // Handle before and after indexes as both service and batch
    res := newJobResult()
    require.Nil(handleTaskGroup(snap, false, job.TaskGroups[0], allocs, 101, res))
    require.Empty(res.drain)
    require.Len(res.migrated, 10)
    require.True(res.done)

    res = newJobResult()
    require.Nil(handleTaskGroup(snap, true, job.TaskGroups[0], allocs, 101, res))
    require.Empty(res.drain)
    require.Len(res.migrated, 10)
    require.True(res.done)

    res = newJobResult()
    require.Nil(handleTaskGroup(snap, false, job.TaskGroups[0], allocs, 103, res))
    require.Empty(res.drain)
    require.Empty(res.migrated)
    require.True(res.done)

    res = newJobResult()
    require.Nil(handleTaskGroup(snap, true, job.TaskGroups[0], allocs, 103, res))
    require.Empty(res.drain)
    require.Empty(res.migrated)
    require.True(res.done)
}

// TestHandleTaskGroup_GarbageCollectedNode asserts that handleTaskGroup works
// when an allocation is on a garbage-collected node.
func TestHandleTaskGroup_GarbageCollectedNode(t *testing.T) {
    ci.Parallel(t)
    require := require.New(t)

    // Create a draining node
    state := state.TestStateStore(t)
    n := mock.Node()
    n.DrainStrategy = &structs.DrainStrategy{
        DrainSpec: structs.DrainSpec{
            Deadline: 5 * time.Minute,
        },
        ForceDeadline: time.Now().Add(1 * time.Minute),
    }
    require.Nil(state.UpsertNode(structs.MsgTypeTestSetup, 100, n))

    job := mock.Job()
    require.Nil(state.UpsertJob(structs.MsgTypeTestSetup, 101, nil, job))

    // Create 10 done allocs
    var allocs []*structs.Allocation
    for i := 0; i < 10; i++ {
        a := mock.Alloc()
        a.Job = job
        a.TaskGroup = job.TaskGroups[0].Name
        a.NodeID = n.ID
        a.DeploymentStatus = &structs.AllocDeploymentStatus{
            Healthy: pointer.Of(false),
        }

        if i%2 == 0 {
            a.DesiredStatus = structs.AllocDesiredStatusStop
            a.ClientStatus = structs.AllocClientStatusComplete
        } else {
            a.ClientStatus = structs.AllocClientStatusFailed
        }
        allocs = append(allocs, a)
    }

    // Make the first one be on a GC'd node
    allocs[0].NodeID = uuid.Generate()
    require.Nil(state.UpsertAllocs(structs.MsgTypeTestSetup, 102, allocs))

    snap, err := state.Snapshot()
    require.Nil(err)

    // Handle before and after indexes as both service and batch
    res := newJobResult()
    require.Nil(handleTaskGroup(snap, false, job.TaskGroups[0], allocs, 101, res))
    require.Empty(res.drain)
    require.Len(res.migrated, 9)
    require.True(res.done)

    res = newJobResult()
    require.Nil(handleTaskGroup(snap, true, job.TaskGroups[0], allocs, 101, res))
    require.Empty(res.drain)
    require.Len(res.migrated, 9)
    require.True(res.done)

    res = newJobResult()
    require.Nil(handleTaskGroup(snap, false, job.TaskGroups[0], allocs, 103, res))
    require.Empty(res.drain)
    require.Empty(res.migrated)
    require.True(res.done)

    res = newJobResult()
    require.Nil(handleTaskGroup(snap, true, job.TaskGroups[0], allocs, 103, res))
    require.Empty(res.drain)
    require.Empty(res.migrated)
    require.True(res.done)
}
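
These tests can be run on their own with the standard Go tooling; a typical invocation (assuming the file lives in the nomad/drainer package of the Nomad repository):

    go test ./nomad/drainer -run 'TestDrainingJobWatcher|TestHandleTaskGroup' -v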