open-nomad/nomad/drainer/watch_jobs_test.go

739 lines
21 KiB
Go

package drainer
import (
"context"
"testing"
"time"
"github.com/hashicorp/nomad/helper"
"github.com/hashicorp/nomad/helper/testlog"
"github.com/hashicorp/nomad/helper/uuid"
"github.com/hashicorp/nomad/nomad/mock"
"github.com/hashicorp/nomad/nomad/state"
"github.com/hashicorp/nomad/nomad/structs"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"golang.org/x/time/rate"
)
func testNodes(t *testing.T, state *state.StateStore) (drainingNode, runningNode *structs.Node) {
n1 := mock.Node()
n1.Name = "draining"
n1.DrainStrategy = &structs.DrainStrategy{
DrainSpec: structs.DrainSpec{
Deadline: time.Minute,
},
ForceDeadline: time.Now().Add(time.Minute),
}
require.Nil(t, state.UpsertNode(structs.MsgTypeTestSetup, 100, n1))
// Create a non-draining node
n2 := mock.Node()
n2.Name = "running"
require.Nil(t, state.UpsertNode(structs.MsgTypeTestSetup, 101, n2))
return n1, n2
}
func testDrainingJobWatcher(t *testing.T, state *state.StateStore) (*drainingJobWatcher, context.CancelFunc) {
t.Helper()
limiter := rate.NewLimiter(100.0, 100)
logger := testlog.HCLogger(t)
ctx, cancel := context.WithCancel(context.Background())
w := NewDrainingJobWatcher(ctx, limiter, state, logger)
return w, cancel
}
// TestDrainingJobWatcher_Interface is a compile-time assertion that we
// implement the intended interface.
func TestDrainingJobWatcher_Interface(t *testing.T) {
w, cancel := testDrainingJobWatcher(t, state.TestStateStore(t))
cancel()
var _ DrainingJobWatcher = w
}
// asertJobWatcherOps asserts a certain number of allocs are drained and/or
// migrated by the job watcher.
func assertJobWatcherOps(t *testing.T, jw DrainingJobWatcher, drained, migrated int) (
*DrainRequest, []*structs.Allocation) {
t.Helper()
var (
drains *DrainRequest
migrations []*structs.Allocation
drainsChecked, migrationsChecked bool
)
for {
select {
case drains = <-jw.Drain():
ids := make([]string, len(drains.Allocs))
for i, a := range drains.Allocs {
ids[i] = a.JobID[:6] + ":" + a.ID[:6]
}
t.Logf("draining %d allocs: %v", len(ids), ids)
require.False(t, drainsChecked, "drains already received")
drainsChecked = true
require.Lenf(t, drains.Allocs, drained,
"expected %d drains but found %d", drained, len(drains.Allocs))
case migrations = <-jw.Migrated():
ids := make([]string, len(migrations))
for i, a := range migrations {
ids[i] = a.JobID[:6] + ":" + a.ID[:6]
}
t.Logf("migrating %d allocs: %v", len(ids), ids)
require.False(t, migrationsChecked, "migrations already received")
migrationsChecked = true
require.Lenf(t, migrations, migrated,
"expected %d migrations but found %d", migrated, len(migrations))
case <-time.After(10 * time.Millisecond):
if !drainsChecked && drained > 0 {
t.Fatalf("expected %d drains but none happened", drained)
}
if !migrationsChecked && migrated > 0 {
t.Fatalf("expected %d migrations but none happened", migrated)
}
return drains, migrations
}
}
}
// TestDrainingJobWatcher_DrainJobs asserts DrainingJobWatcher batches
// allocation changes from multiple jobs.
func TestDrainingJobWatcher_DrainJobs(t *testing.T) {
t.Parallel()
require := require.New(t)
state := state.TestStateStore(t)
jobWatcher, cancelWatcher := testDrainingJobWatcher(t, state)
defer cancelWatcher()
drainingNode, runningNode := testNodes(t, state)
var index uint64 = 101
count := 8
newAlloc := func(node *structs.Node, job *structs.Job) *structs.Allocation {
a := mock.Alloc()
a.JobID = job.ID
a.Job = job
a.TaskGroup = job.TaskGroups[0].Name
a.NodeID = node.ID
return a
}
// 2 jobs with count 10, max parallel 3
jnss := make([]structs.NamespacedID, 2)
jobs := make([]*structs.Job, 2)
for i := 0; i < 2; i++ {
job := mock.Job()
jobs[i] = job
jnss[i] = structs.NamespacedID{Namespace: job.Namespace, ID: job.ID}
job.TaskGroups[0].Migrate.MaxParallel = 3
job.TaskGroups[0].Count = count
require.Nil(state.UpsertJob(structs.MsgTypeTestSetup, index, job))
index++
var allocs []*structs.Allocation
for i := 0; i < count; i++ {
a := newAlloc(drainingNode, job)
a.DeploymentStatus = &structs.AllocDeploymentStatus{
Healthy: helper.BoolToPtr(true),
}
allocs = append(allocs, a)
}
require.Nil(state.UpsertAllocs(structs.MsgTypeTestSetup, index, allocs))
index++
}
// Only register jobs with watcher after creating all data models as
// once the watcher starts we need to track the index carefully for
// updating the batch future
jobWatcher.RegisterJobs(jnss)
// Expect a first batch of MaxParallel allocs from each job
drains, _ := assertJobWatcherOps(t, jobWatcher, 6, 0)
// Fake migrating the drained allocs by starting new ones and stopping
// the old ones
drainedAllocs := make([]*structs.Allocation, len(drains.Allocs))
for i, a := range drains.Allocs {
a.DesiredTransition.Migrate = helper.BoolToPtr(true)
// create a copy so we can reuse this slice
drainedAllocs[i] = a.Copy()
}
require.Nil(state.UpsertAllocs(structs.MsgTypeTestSetup, index, drainedAllocs))
drains.Resp.Respond(index, nil)
index++
// Just setting ShouldMigrate should not cause any further drains
assertJobWatcherOps(t, jobWatcher, 0, 0)
// Proceed our fake migration along by creating new allocs and stopping
// old ones
replacements := make([]*structs.Allocation, len(drainedAllocs))
updates := make([]*structs.Allocation, 0, len(drainedAllocs)*2)
for i, a := range drainedAllocs {
// Stop drained allocs
a.DesiredTransition.Migrate = nil
a.DesiredStatus = structs.AllocDesiredStatusStop
// Create a replacement
replacement := mock.Alloc()
replacement.JobID = a.Job.ID
replacement.Job = a.Job
replacement.TaskGroup = a.TaskGroup
replacement.NodeID = runningNode.ID
// start in pending state with no health status
updates = append(updates, a, replacement)
replacements[i] = replacement.Copy()
}
require.Nil(state.UpsertAllocs(structs.MsgTypeTestSetup, index, updates))
index++
// The drained allocs stopping cause migrations but no new drains
// because the replacements have not started
assertJobWatcherOps(t, jobWatcher, 0, 6)
// Finally kickoff further drain activity by "starting" replacements
for _, a := range replacements {
a.ClientStatus = structs.AllocClientStatusRunning
a.DeploymentStatus = &structs.AllocDeploymentStatus{
Healthy: helper.BoolToPtr(true),
}
}
require.Nil(state.UpsertAllocs(structs.MsgTypeTestSetup, index, replacements))
index++
require.NotEmpty(jobWatcher.drainingJobs())
// 6 new drains
drains, _ = assertJobWatcherOps(t, jobWatcher, 6, 0)
// Fake migrations once more to finish the drain
drainedAllocs = make([]*structs.Allocation, len(drains.Allocs))
for i, a := range drains.Allocs {
a.DesiredTransition.Migrate = helper.BoolToPtr(true)
// create a copy so we can reuse this slice
drainedAllocs[i] = a.Copy()
}
require.Nil(state.UpsertAllocs(structs.MsgTypeTestSetup, index, drainedAllocs))
drains.Resp.Respond(index, nil)
index++
assertJobWatcherOps(t, jobWatcher, 0, 0)
replacements = make([]*structs.Allocation, len(drainedAllocs))
updates = make([]*structs.Allocation, 0, len(drainedAllocs)*2)
for i, a := range drainedAllocs {
a.DesiredTransition.Migrate = nil
a.DesiredStatus = structs.AllocDesiredStatusStop
replacement := newAlloc(runningNode, a.Job)
updates = append(updates, a, replacement)
replacements[i] = replacement.Copy()
}
require.Nil(state.UpsertAllocs(structs.MsgTypeTestSetup, index, updates))
index++
assertJobWatcherOps(t, jobWatcher, 0, 6)
for _, a := range replacements {
a.ClientStatus = structs.AllocClientStatusRunning
a.DeploymentStatus = &structs.AllocDeploymentStatus{
Healthy: helper.BoolToPtr(true),
}
}
require.Nil(state.UpsertAllocs(structs.MsgTypeTestSetup, index, replacements))
index++
require.NotEmpty(jobWatcher.drainingJobs())
// Final 4 new drains
drains, _ = assertJobWatcherOps(t, jobWatcher, 4, 0)
// Fake migrations once more to finish the drain
drainedAllocs = make([]*structs.Allocation, len(drains.Allocs))
for i, a := range drains.Allocs {
a.DesiredTransition.Migrate = helper.BoolToPtr(true)
// create a copy so we can reuse this slice
drainedAllocs[i] = a.Copy()
}
require.Nil(state.UpsertAllocs(structs.MsgTypeTestSetup, index, drainedAllocs))
drains.Resp.Respond(index, nil)
index++
assertJobWatcherOps(t, jobWatcher, 0, 0)
replacements = make([]*structs.Allocation, len(drainedAllocs))
updates = make([]*structs.Allocation, 0, len(drainedAllocs)*2)
for i, a := range drainedAllocs {
a.DesiredTransition.Migrate = nil
a.DesiredStatus = structs.AllocDesiredStatusStop
replacement := newAlloc(runningNode, a.Job)
updates = append(updates, a, replacement)
replacements[i] = replacement.Copy()
}
require.Nil(state.UpsertAllocs(structs.MsgTypeTestSetup, index, updates))
index++
assertJobWatcherOps(t, jobWatcher, 0, 4)
for _, a := range replacements {
a.ClientStatus = structs.AllocClientStatusRunning
a.DeploymentStatus = &structs.AllocDeploymentStatus{
Healthy: helper.BoolToPtr(true),
}
}
require.Nil(state.UpsertAllocs(structs.MsgTypeTestSetup, index, replacements))
// No jobs should be left!
require.Empty(jobWatcher.drainingJobs())
}
// DrainingJobWatcher tests:
// TODO Test that the watcher cancels its query when a new job is registered
// handleTaskGroupTestCase is the test case struct for TestHandleTaskGroup
//
// Two nodes will be initialized: one draining and one running.
type handleTaskGroupTestCase struct {
// Name of test
Name string
// Batch uses a batch job and alloc
Batch bool
// Expectations
ExpectedDrained int
ExpectedMigrated int
ExpectedDone bool
// Count overrides the default count of 10 if set
Count int
// MaxParallel overrides the default max_parallel of 1 if set
MaxParallel int
// AddAlloc will be called 10 times to create test allocs
//
// Allocs default to be healthy on the draining node
AddAlloc func(i int, a *structs.Allocation, drainingID, runningID string)
}
func TestHandeTaskGroup_Table(t *testing.T) {
cases := []handleTaskGroupTestCase{
{
// All allocs on draining node
Name: "AllDraining",
ExpectedDrained: 1,
ExpectedMigrated: 0,
ExpectedDone: false,
},
{
// All allocs on non-draining node
Name: "AllNonDraining",
ExpectedDrained: 0,
ExpectedMigrated: 0,
ExpectedDone: true,
AddAlloc: func(i int, a *structs.Allocation, drainingID, runningID string) {
a.NodeID = runningID
},
},
{
// Some allocs on non-draining node but not healthy
Name: "SomeNonDrainingUnhealthy",
ExpectedDrained: 0,
ExpectedMigrated: 0,
ExpectedDone: false,
AddAlloc: func(i int, a *structs.Allocation, drainingID, runningID string) {
if i%2 == 0 {
a.NodeID = runningID
a.DeploymentStatus = nil
}
},
},
{
// One draining, other allocs on non-draining node and healthy
Name: "OneDraining",
ExpectedDrained: 1,
ExpectedMigrated: 0,
ExpectedDone: false,
AddAlloc: func(i int, a *structs.Allocation, drainingID, runningID string) {
if i != 0 {
a.NodeID = runningID
}
},
},
{
// One already draining, other allocs on non-draining node and healthy
Name: "OneAlreadyDraining",
ExpectedDrained: 0,
ExpectedMigrated: 0,
ExpectedDone: false,
AddAlloc: func(i int, a *structs.Allocation, drainingID, runningID string) {
if i == 0 {
a.DesiredTransition.Migrate = helper.BoolToPtr(true)
return
}
a.NodeID = runningID
},
},
{
// One already drained, other allocs on non-draining node and healthy
Name: "OneAlreadyDrained",
ExpectedDrained: 0,
ExpectedMigrated: 1,
ExpectedDone: true,
AddAlloc: func(i int, a *structs.Allocation, drainingID, runningID string) {
if i == 0 {
a.DesiredStatus = structs.AllocDesiredStatusStop
return
}
a.NodeID = runningID
},
},
{
// One already drained, other allocs on non-draining node and healthy
Name: "OneAlreadyDrainedBatched",
Batch: true,
ExpectedDrained: 0,
ExpectedMigrated: 1,
ExpectedDone: true,
AddAlloc: func(i int, a *structs.Allocation, drainingID, runningID string) {
if i == 0 {
a.DesiredStatus = structs.AllocDesiredStatusStop
return
}
a.NodeID = runningID
},
},
{
// All allocs are terminl, nothing to be drained
Name: "AllMigrating",
ExpectedDrained: 0,
ExpectedMigrated: 10,
ExpectedDone: true,
AddAlloc: func(i int, a *structs.Allocation, drainingID, runningID string) {
a.DesiredStatus = structs.AllocDesiredStatusStop
},
},
{
// All allocs are terminl, nothing to be drained
Name: "AllMigratingBatch",
Batch: true,
ExpectedDrained: 0,
ExpectedMigrated: 10,
ExpectedDone: true,
AddAlloc: func(i int, a *structs.Allocation, drainingID, runningID string) {
a.DesiredStatus = structs.AllocDesiredStatusStop
},
},
{
// All allocs may be drained at once
Name: "AllAtOnce",
ExpectedDrained: 10,
ExpectedMigrated: 0,
ExpectedDone: false,
MaxParallel: 10,
},
{
// Drain 2
Name: "Drain2",
ExpectedDrained: 2,
ExpectedMigrated: 0,
ExpectedDone: false,
MaxParallel: 2,
},
{
// One on new node, one drained, and one draining
ExpectedDrained: 1,
ExpectedMigrated: 1,
MaxParallel: 2,
AddAlloc: func(i int, a *structs.Allocation, drainingID, runningID string) {
switch i {
case 0:
// One alloc on running node
a.NodeID = runningID
case 1:
// One alloc already migrated
a.DesiredStatus = structs.AllocDesiredStatusStop
}
},
},
{
// 8 on new node, one drained, and one draining
ExpectedDrained: 1,
ExpectedMigrated: 1,
MaxParallel: 2,
AddAlloc: func(i int, a *structs.Allocation, drainingID, runningID string) {
switch i {
case 0, 1, 2, 3, 4, 5, 6, 7:
a.NodeID = runningID
case 8:
a.DesiredStatus = structs.AllocDesiredStatusStop
}
},
},
{
// 5 on new node, two drained, and three draining
ExpectedDrained: 3,
ExpectedMigrated: 2,
MaxParallel: 5,
AddAlloc: func(i int, a *structs.Allocation, drainingID, runningID string) {
switch i {
case 0, 1, 2, 3, 4:
a.NodeID = runningID
case 8, 9:
a.DesiredStatus = structs.AllocDesiredStatusStop
}
},
},
{
// Not all on new node have health set
Name: "PendingHealth",
ExpectedDrained: 1,
ExpectedMigrated: 1,
MaxParallel: 3,
AddAlloc: func(i int, a *structs.Allocation, drainingID, runningID string) {
switch i {
case 0:
// Deployment status UNset for 1 on new node
a.NodeID = runningID
a.DeploymentStatus = nil
case 1, 2, 3, 4:
// Deployment status set for 4 on new node
a.NodeID = runningID
case 9:
a.DesiredStatus = structs.AllocDesiredStatusStop
}
},
},
{
// 5 max parallel - 1 migrating - 2 with unset health = 2 drainable
Name: "PendingHealthHigherMax",
ExpectedDrained: 2,
ExpectedMigrated: 1,
MaxParallel: 5,
AddAlloc: func(i int, a *structs.Allocation, drainingID, runningID string) {
switch i {
case 0, 1:
// Deployment status UNset for 2 on new node
a.NodeID = runningID
a.DeploymentStatus = nil
case 2, 3, 4:
// Deployment status set for 3 on new node
a.NodeID = runningID
case 9:
a.DesiredStatus = structs.AllocDesiredStatusStop
}
},
},
}
for _, testCase := range cases {
t.Run(testCase.Name, func(t *testing.T) {
testHandleTaskGroup(t, testCase)
})
}
}
func testHandleTaskGroup(t *testing.T, tc handleTaskGroupTestCase) {
t.Parallel()
require := require.New(t)
assert := assert.New(t)
// Create nodes
state := state.TestStateStore(t)
drainingNode, runningNode := testNodes(t, state)
job := mock.Job()
if tc.Batch {
job = mock.BatchJob()
}
job.TaskGroups[0].Count = 10
if tc.Count > 0 {
job.TaskGroups[0].Count = tc.Count
}
if tc.MaxParallel > 0 {
job.TaskGroups[0].Migrate.MaxParallel = tc.MaxParallel
}
require.Nil(state.UpsertJob(structs.MsgTypeTestSetup, 102, job))
var allocs []*structs.Allocation
for i := 0; i < 10; i++ {
a := mock.Alloc()
if tc.Batch {
a = mock.BatchAlloc()
}
a.JobID = job.ID
a.Job = job
a.TaskGroup = job.TaskGroups[0].Name
// Default to being healthy on the draining node
a.NodeID = drainingNode.ID
a.DeploymentStatus = &structs.AllocDeploymentStatus{
Healthy: helper.BoolToPtr(true),
}
if tc.AddAlloc != nil {
tc.AddAlloc(i, a, drainingNode.ID, runningNode.ID)
}
allocs = append(allocs, a)
}
require.Nil(state.UpsertAllocs(structs.MsgTypeTestSetup, 103, allocs))
snap, err := state.Snapshot()
require.Nil(err)
res := newJobResult()
require.Nil(handleTaskGroup(snap, tc.Batch, job.TaskGroups[0], allocs, 102, res))
assert.Lenf(res.drain, tc.ExpectedDrained, "Drain expected %d but found: %d",
tc.ExpectedDrained, len(res.drain))
assert.Lenf(res.migrated, tc.ExpectedMigrated, "Migrate expected %d but found: %d",
tc.ExpectedMigrated, len(res.migrated))
assert.Equal(tc.ExpectedDone, res.done)
}
func TestHandleTaskGroup_Migrations(t *testing.T) {
t.Parallel()
require := require.New(t)
// Create a draining node
state := state.TestStateStore(t)
n := mock.Node()
n.DrainStrategy = &structs.DrainStrategy{
DrainSpec: structs.DrainSpec{
Deadline: 5 * time.Minute,
},
ForceDeadline: time.Now().Add(1 * time.Minute),
}
require.Nil(state.UpsertNode(structs.MsgTypeTestSetup, 100, n))
job := mock.Job()
require.Nil(state.UpsertJob(structs.MsgTypeTestSetup, 101, job))
// Create 10 done allocs
var allocs []*structs.Allocation
for i := 0; i < 10; i++ {
a := mock.Alloc()
a.Job = job
a.TaskGroup = job.TaskGroups[0].Name
a.NodeID = n.ID
a.DeploymentStatus = &structs.AllocDeploymentStatus{
Healthy: helper.BoolToPtr(false),
}
if i%2 == 0 {
a.DesiredStatus = structs.AllocDesiredStatusStop
} else {
a.ClientStatus = structs.AllocClientStatusFailed
}
allocs = append(allocs, a)
}
require.Nil(state.UpsertAllocs(structs.MsgTypeTestSetup, 102, allocs))
snap, err := state.Snapshot()
require.Nil(err)
// Handle before and after indexes as both service and batch
res := newJobResult()
require.Nil(handleTaskGroup(snap, false, job.TaskGroups[0], allocs, 101, res))
require.Empty(res.drain)
require.Len(res.migrated, 10)
require.True(res.done)
res = newJobResult()
require.Nil(handleTaskGroup(snap, true, job.TaskGroups[0], allocs, 101, res))
require.Empty(res.drain)
require.Len(res.migrated, 10)
require.True(res.done)
res = newJobResult()
require.Nil(handleTaskGroup(snap, false, job.TaskGroups[0], allocs, 103, res))
require.Empty(res.drain)
require.Empty(res.migrated)
require.True(res.done)
res = newJobResult()
require.Nil(handleTaskGroup(snap, true, job.TaskGroups[0], allocs, 103, res))
require.Empty(res.drain)
require.Empty(res.migrated)
require.True(res.done)
}
// This test asserts that handle task group works when an allocation is on a
// garbage collected node
func TestHandleTaskGroup_GarbageCollectedNode(t *testing.T) {
t.Parallel()
require := require.New(t)
// Create a draining node
state := state.TestStateStore(t)
n := mock.Node()
n.DrainStrategy = &structs.DrainStrategy{
DrainSpec: structs.DrainSpec{
Deadline: 5 * time.Minute,
},
ForceDeadline: time.Now().Add(1 * time.Minute),
}
require.Nil(state.UpsertNode(structs.MsgTypeTestSetup, 100, n))
job := mock.Job()
require.Nil(state.UpsertJob(structs.MsgTypeTestSetup, 101, job))
// Create 10 done allocs
var allocs []*structs.Allocation
for i := 0; i < 10; i++ {
a := mock.Alloc()
a.Job = job
a.TaskGroup = job.TaskGroups[0].Name
a.NodeID = n.ID
a.DeploymentStatus = &structs.AllocDeploymentStatus{
Healthy: helper.BoolToPtr(false),
}
if i%2 == 0 {
a.DesiredStatus = structs.AllocDesiredStatusStop
} else {
a.ClientStatus = structs.AllocClientStatusFailed
}
allocs = append(allocs, a)
}
// Make the first one be on a GC'd node
allocs[0].NodeID = uuid.Generate()
require.Nil(state.UpsertAllocs(structs.MsgTypeTestSetup, 102, allocs))
snap, err := state.Snapshot()
require.Nil(err)
// Handle before and after indexes as both service and batch
res := newJobResult()
require.Nil(handleTaskGroup(snap, false, job.TaskGroups[0], allocs, 101, res))
require.Empty(res.drain)
require.Len(res.migrated, 9)
require.True(res.done)
res = newJobResult()
require.Nil(handleTaskGroup(snap, true, job.TaskGroups[0], allocs, 101, res))
require.Empty(res.drain)
require.Len(res.migrated, 9)
require.True(res.done)
res = newJobResult()
require.Nil(handleTaskGroup(snap, false, job.TaskGroups[0], allocs, 103, res))
require.Empty(res.drain)
require.Empty(res.migrated)
require.True(res.done)
res = newJobResult()
require.Nil(handleTaskGroup(snap, true, job.TaskGroups[0], allocs, 103, res))
require.Empty(res.drain)
require.Empty(res.migrated)
require.True(res.done)
}