The `TestDrainer_AllTypes_NoDeadline` test has been flaky. It looks like this is because the final update marking the batch allocations complete writes to the state store directly rather than going through an RPC. If the service jobs have restarted in the meantime, the `allocClientStateSimulator` will have bumped the index on the allocations table, which prevents the drainer from unblocking (and being marked complete) because the batch allocation updates are written with an earlier index. This changeset attempts to fix that by making the update via RPC, as it would be in real (non-test) code.
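
For reference, the RPC-based update has roughly the shape of the hypothetical helper below. This is only a sketch, not code from the changeset; it assumes the fixtures the test file already uses (an `rpc.ClientCodec` from `rpcClient`, plus `structs`, `must`, and `msgpackrpc`):

```go
package nomad

import (
	"net/rpc"
	"testing"

	msgpackrpc "github.com/hashicorp/net-rpc-msgpackrpc"
	"github.com/shoenig/test/must"

	"github.com/hashicorp/nomad/nomad/structs"
)

// markAllocsComplete is a hypothetical helper sketching the RPC-based update:
// instead of upserting directly into the state store, the completion is
// submitted through the Node.UpdateAlloc RPC, the same path a real client
// update takes.
func markAllocsComplete(t *testing.T, codec rpc.ClientCodec, allocs []*structs.Allocation) {
	t.Helper()

	updates := make([]*structs.Allocation, 0, len(allocs))
	for _, alloc := range allocs {
		update := alloc.Copy()
		update.ClientStatus = structs.AllocClientStatusComplete
		updates = append(updates, update)
	}

	req := &structs.AllocUpdateRequest{
		Alloc:        updates,
		WriteRequest: structs.WriteRequest{Region: "global"},
	}
	var resp structs.GenericResponse
	must.NoError(t, msgpackrpc.CallWithCodec(codec, "Node.UpdateAlloc", req, &resp))
}
```

Routing the write through `Node.UpdateAlloc` means it is applied with a current index, so the drainer's blocking queries can observe it even if the simulator has written to the allocations table in the meantime.
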
// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: MPL-2.0

package nomad

import (
	"context"
	"fmt"
	"testing"
	"time"

	log "github.com/hashicorp/go-hclog"
	memdb "github.com/hashicorp/go-memdb"
	msgpackrpc "github.com/hashicorp/net-rpc-msgpackrpc"
	"github.com/shoenig/test/must"
	"github.com/shoenig/test/wait"

	"github.com/hashicorp/nomad/ci"
	"github.com/hashicorp/nomad/helper/pointer"
	"github.com/hashicorp/nomad/helper/uuid"
	"github.com/hashicorp/nomad/nomad/drainer"
	"github.com/hashicorp/nomad/nomad/mock"
	"github.com/hashicorp/nomad/nomad/state"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/hashicorp/nomad/testutil"
)

// allocClientStateSimulator simulates the updates in state from the
// client. service allocations that are new on the server get marked with
// healthy deployments, and service allocations that are DesiredStatus=stop on
// the server get updates with terminal client status.
func allocClientStateSimulator(t *testing.T, errCh chan<- error, ctx context.Context,
	srv *Server, nodeID string, logger log.Logger) {

	codec := rpcClient(t, srv)
	store := srv.State()

	nindex := uint64(1)
	for {
		allocs, index, err := getNodeAllocs(ctx, store, nodeID, nindex)
		if err != nil {
			if err == context.Canceled {
				return
			}

			errCh <- fmt.Errorf("failed to get node allocs: %v", err)
			return
		}
		nindex = index

		// For each alloc that doesn't have its deployment status set, set it
		var updates []*structs.Allocation
		now := time.Now()
		for _, alloc := range allocs {
			if alloc.Job.Type != structs.JobTypeService {
				continue
			}

			switch alloc.DesiredStatus {
			case structs.AllocDesiredStatusRun:
				if alloc.DeploymentStatus.HasHealth() {
					continue // only update to healthy once
				}
				newAlloc := alloc.Copy()
				newAlloc.DeploymentStatus = &structs.AllocDeploymentStatus{
					Healthy:   pointer.Of(true),
					Timestamp: now,
				}
				updates = append(updates, newAlloc)
				logger.Trace("marking deployment health for alloc", "alloc_id", alloc.ID)

			case structs.AllocDesiredStatusStop, structs.AllocDesiredStatusEvict:
				if alloc.ClientStatus == structs.AllocClientStatusComplete {
					continue // only update to complete once
				}
				newAlloc := alloc.Copy()
				newAlloc.ClientStatus = structs.AllocClientStatusComplete
				updates = append(updates, newAlloc)
				logger.Trace("marking alloc complete", "alloc_id", alloc.ID)
			}

		}

		if len(updates) == 0 {
			continue
		}

		// Send the update
		req := &structs.AllocUpdateRequest{
			Alloc:        updates,
			WriteRequest: structs.WriteRequest{Region: "global"},
		}
		var resp structs.GenericResponse
		if err := msgpackrpc.CallWithCodec(codec, "Node.UpdateAlloc", req, &resp); err != nil {
			if ctx.Err() == context.Canceled {
				return
			} else if err != nil {
				errCh <- err
			}
		}
	}
}

// checkAllocPromoter is a small helper to return an error or nil from an error
// chan like the one given to the allocClientStateSimulator goroutine.
func checkAllocPromoter(errCh chan error) error {
	select {
	case err := <-errCh:
		return err
	default:
		return nil
	}
}

func getNodeAllocs(ctx context.Context, store *state.StateStore, nodeID string, index uint64) ([]*structs.Allocation, uint64, error) {
	resp, index, err := store.BlockingQuery(getNodeAllocsImpl(nodeID), index, ctx)
	if err != nil {
		return nil, 0, err
	}
	if err := ctx.Err(); err != nil {
		return nil, 0, err
	}

	return resp.([]*structs.Allocation), index, nil
}

func getNodeAllocsImpl(nodeID string) func(ws memdb.WatchSet, store *state.StateStore) (interface{}, uint64, error) {
	return func(ws memdb.WatchSet, store *state.StateStore) (interface{}, uint64, error) {
		// Capture all the allocations
		allocs, err := store.AllocsByNode(ws, nodeID)
		if err != nil {
			return nil, 0, err
		}

		// Use the last index that affected the allocs table
		index, err := store.Index("allocs")
		if err != nil {
			return nil, index, err
		}

		return allocs, index, nil
	}
}

func TestDrainer_Simple_ServiceOnly(t *testing.T) {
	ci.Parallel(t)

	srv, cleanupSrv := TestServer(t, nil)
	defer cleanupSrv()
	codec := rpcClient(t, srv)
	testutil.WaitForLeader(t, srv.RPC)
	store := srv.State()

	// Create a node
	n1 := mock.Node()
	nodeReg := &structs.NodeRegisterRequest{
		Node:         n1,
		WriteRequest: structs.WriteRequest{Region: "global"},
	}
	var nodeResp structs.NodeUpdateResponse
	must.NoError(t, msgpackrpc.CallWithCodec(codec, "Node.Register", nodeReg, &nodeResp))

	// Create a job that runs on that node
	job := mock.Job()
	job.TaskGroups[0].Count = 2
	req := &structs.JobRegisterRequest{
		Job: job,
		WriteRequest: structs.WriteRequest{
			Region:    "global",
			Namespace: job.Namespace,
		},
	}
	var resp structs.JobRegisterResponse
	must.NoError(t, msgpackrpc.CallWithCodec(codec, "Job.Register", req, &resp))
	must.Positive(t, resp.Index)

	// Wait for the two allocations to be placed
	waitForPlacedAllocs(t, store, n1.ID, 2)

	// Create the second node
	n2 := mock.Node()
	nodeReg = &structs.NodeRegisterRequest{
		Node:         n2,
		WriteRequest: structs.WriteRequest{Region: "global"},
	}
	must.NoError(t, msgpackrpc.CallWithCodec(codec, "Node.Register", nodeReg, &nodeResp))

	// Drain the first node
	drainReq := &structs.NodeUpdateDrainRequest{
		NodeID: n1.ID,
		DrainStrategy: &structs.DrainStrategy{
			DrainSpec: structs.DrainSpec{
				Deadline: 10 * time.Minute,
			},
		},
		WriteRequest: structs.WriteRequest{Region: "global"},
	}
	var drainResp structs.NodeDrainUpdateResponse
	must.NoError(t, msgpackrpc.CallWithCodec(codec, "Node.UpdateDrain", drainReq, &drainResp))

	// Setup client simulator
	errCh := make(chan error, 2)
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	go allocClientStateSimulator(t, errCh, ctx, srv, n1.ID, srv.logger)
	go allocClientStateSimulator(t, errCh, ctx, srv, n2.ID, srv.logger)

	// Wait for the allocs to be replaced
	waitForAllocsStop(t, store, n1.ID, nil)
	waitForPlacedAllocs(t, store, n2.ID, 2)

	// Wait for the node drain to be marked complete with the events we expect
	waitForNodeDrainComplete(t, store, n1.ID, errCh, 3, "")
}

func TestDrainer_Simple_ServiceOnly_Deadline(t *testing.T) {
	ci.Parallel(t)

	srv, cleanupSrv := TestServer(t, nil)
	defer cleanupSrv()
	codec := rpcClient(t, srv)
	testutil.WaitForLeader(t, srv.RPC)
	store := srv.State()

	// Create a node
	n1 := mock.Node()
	nodeReg := &structs.NodeRegisterRequest{
		Node:         n1,
		WriteRequest: structs.WriteRequest{Region: "global"},
	}
	var nodeResp structs.NodeUpdateResponse
	must.NoError(t, msgpackrpc.CallWithCodec(codec, "Node.Register", nodeReg, &nodeResp))

	// Create a job that runs on it
	job := mock.Job()
	job.Update = *structs.DefaultUpdateStrategy
	job.Update.Stagger = 30 * time.Second
	job.TaskGroups[0].Count = 2
	req := &structs.JobRegisterRequest{
		Job: job,
		WriteRequest: structs.WriteRequest{
			Region:    "global",
			Namespace: job.Namespace,
		},
	}
	var resp structs.JobRegisterResponse
	must.NoError(t, msgpackrpc.CallWithCodec(codec, "Job.Register", req, &resp))
	must.Positive(t, resp.Index)

	// Wait for the two allocations to be placed
	waitForPlacedAllocs(t, store, n1.ID, 2)

	// Drain the node
	drainReq := &structs.NodeUpdateDrainRequest{
		NodeID: n1.ID,
		DrainStrategy: &structs.DrainStrategy{
			DrainSpec: structs.DrainSpec{
				Deadline: 1 * time.Second,
			},
		},
		WriteRequest: structs.WriteRequest{Region: "global"},
	}
	var drainResp structs.NodeDrainUpdateResponse
	must.NoError(t, msgpackrpc.CallWithCodec(codec, "Node.UpdateDrain", drainReq, &drainResp))

	// Wait for the allocs to be stopped (but not replaced)
	waitForAllocsStop(t, store, n1.ID, nil)

	// Wait for the node drain to be marked complete with the events we expect
	waitForNodeDrainComplete(t, store, n1.ID, nil, 3, drainer.NodeDrainEventDetailDeadlined)
}

func TestDrainer_DrainEmptyNode(t *testing.T) {
	ci.Parallel(t)

	srv, cleanupSrv := TestServer(t, nil)
	defer cleanupSrv()
	codec := rpcClient(t, srv)
	testutil.WaitForLeader(t, srv.RPC)
	store := srv.State()

	// Create an empty node
	n1 := mock.Node()
	nodeReg := &structs.NodeRegisterRequest{
		Node:         n1,
		WriteRequest: structs.WriteRequest{Region: "global"},
	}
	var nodeResp structs.NodeUpdateResponse
	must.NoError(t, msgpackrpc.CallWithCodec(codec, "Node.Register", nodeReg, &nodeResp))

	// Drain the node
	drainReq := &structs.NodeUpdateDrainRequest{
		NodeID: n1.ID,
		DrainStrategy: &structs.DrainStrategy{
			DrainSpec: structs.DrainSpec{
				Deadline: 10 * time.Minute,
			},
		},
		WriteRequest: structs.WriteRequest{Region: "global"},
	}
	var drainResp structs.NodeDrainUpdateResponse
	must.NoError(t, msgpackrpc.CallWithCodec(codec, "Node.UpdateDrain", drainReq, &drainResp))

	// Wait for the node drain to be marked complete with the events we expect
	waitForNodeDrainComplete(t, store, n1.ID, nil, 3, "")
}

func TestDrainer_AllTypes_Deadline(t *testing.T) {
	ci.Parallel(t)

	srv, cleanupSrv := TestServer(t, nil)
	defer cleanupSrv()
	codec := rpcClient(t, srv)
	testutil.WaitForLeader(t, srv.RPC)
	store := srv.State()

	// Create a node
	n1 := mock.Node()
	nodeReg := &structs.NodeRegisterRequest{
		Node:         n1,
		WriteRequest: structs.WriteRequest{Region: "global"},
	}
	var nodeResp structs.NodeUpdateResponse
	must.NoError(t, msgpackrpc.CallWithCodec(codec, "Node.Register", nodeReg, &nodeResp))

	// Create a service job that runs on it
	job := mock.Job()
	job.TaskGroups[0].Count = 2
	req := &structs.JobRegisterRequest{
		Job: job,
		WriteRequest: structs.WriteRequest{
			Region:    "global",
			Namespace: job.Namespace,
		},
	}
	var resp structs.JobRegisterResponse
	must.NoError(t, msgpackrpc.CallWithCodec(codec, "Job.Register", req, &resp))
	must.Positive(t, resp.Index)

	// Create a system job
	sysjob := mock.SystemJob()
	req = &structs.JobRegisterRequest{
		Job: sysjob,
		WriteRequest: structs.WriteRequest{
			Region:    "global",
			Namespace: job.Namespace,
		},
	}
	must.NoError(t, msgpackrpc.CallWithCodec(codec, "Job.Register", req, &resp))
	must.Positive(t, resp.Index)

	// Create a batch job
	bjob := mock.BatchJob()
	bjob.TaskGroups[0].Count = 2
	req = &structs.JobRegisterRequest{
		Job: bjob,
		WriteRequest: structs.WriteRequest{
			Region:    "global",
			Namespace: job.Namespace,
		},
	}
	must.NoError(t, msgpackrpc.CallWithCodec(codec, "Job.Register", req, &resp))
	must.Positive(t, resp.Index)

	// Wait for all the allocations to be placed
	waitForPlacedAllocs(t, store, n1.ID, 5)

	// Create a second node
	n2 := mock.Node()
	nodeReg = &structs.NodeRegisterRequest{
		Node:         n2,
		WriteRequest: structs.WriteRequest{Region: "global"},
	}
	must.NoError(t, msgpackrpc.CallWithCodec(codec, "Node.Register", nodeReg, &nodeResp))

	// Drain the first node
	drainReq := &structs.NodeUpdateDrainRequest{
		NodeID: n1.ID,
		DrainStrategy: &structs.DrainStrategy{
			DrainSpec: structs.DrainSpec{
				Deadline: 2 * time.Second,
			},
		},
		WriteRequest: structs.WriteRequest{Region: "global"},
	}
	var drainResp structs.NodeDrainUpdateResponse
	must.NoError(t, msgpackrpc.CallWithCodec(codec, "Node.UpdateDrain", drainReq, &drainResp))

	// Setup client simulator
	errCh := make(chan error, 2)
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	go allocClientStateSimulator(t, errCh, ctx, srv, n1.ID, srv.logger)
	go allocClientStateSimulator(t, errCh, ctx, srv, n2.ID, srv.logger)

	// Wait for allocs to be replaced
	finalAllocs := waitForAllocsStop(t, store, n1.ID, nil)
	waitForPlacedAllocs(t, store, n2.ID, 5)

	// Assert that the service finished before the batch and system
	var serviceMax, batchMax uint64 = 0, 0
	for _, alloc := range finalAllocs {
		if alloc.Job.Type == structs.JobTypeService && alloc.ModifyIndex > serviceMax {
			serviceMax = alloc.ModifyIndex
		} else if alloc.Job.Type == structs.JobTypeBatch && alloc.ModifyIndex > batchMax {
			batchMax = alloc.ModifyIndex
		}
	}
	must.Less(t, batchMax, serviceMax)

	// Wait for the node drain to be marked complete with the events we expect
	waitForNodeDrainComplete(t, store, n1.ID, nil, 3, drainer.NodeDrainEventDetailDeadlined)
}

// Test that drain is unset when batch jobs naturally finish
func TestDrainer_AllTypes_NoDeadline(t *testing.T) {
	ci.Parallel(t)

	srv, cleanupSrv := TestServer(t, nil)
	defer cleanupSrv()
	codec := rpcClient(t, srv)
	testutil.WaitForLeader(t, srv.RPC)
	store := srv.State()

	// Create two nodes, registering the second later
	n1 := mock.Node()
	nodeReg := &structs.NodeRegisterRequest{
		Node:         n1,
		WriteRequest: structs.WriteRequest{Region: "global"},
	}
	var nodeResp structs.NodeUpdateResponse
	must.NoError(t, msgpackrpc.CallWithCodec(codec, "Node.Register", nodeReg, &nodeResp))

	// Create a service job
	job := mock.Job()
	job.TaskGroups[0].Count = 2
	req := &structs.JobRegisterRequest{
		Job: job,
		WriteRequest: structs.WriteRequest{
			Region:    "global",
			Namespace: job.Namespace,
		},
	}
	var resp structs.JobRegisterResponse
	must.NoError(t, msgpackrpc.CallWithCodec(codec, "Job.Register", req, &resp))
	must.Positive(t, resp.Index)

	// Create a system job
	sysjob := mock.SystemJob()
	req = &structs.JobRegisterRequest{
		Job: sysjob,
		WriteRequest: structs.WriteRequest{
			Region:    "global",
			Namespace: job.Namespace,
		},
	}
	must.NoError(t, msgpackrpc.CallWithCodec(codec, "Job.Register", req, &resp))
	must.Positive(t, resp.Index)

	// Create a batch job
	bjob := mock.BatchJob()
	bjob.TaskGroups[0].Count = 2
	req = &structs.JobRegisterRequest{
		Job: bjob,
		WriteRequest: structs.WriteRequest{
			Region:    "global",
			Namespace: job.Namespace,
		},
	}
	must.NoError(t, msgpackrpc.CallWithCodec(codec, "Job.Register", req, &resp))
	must.Positive(t, resp.Index)

	// Wait for all the allocations to be placed
	waitForPlacedAllocs(t, store, n1.ID, 5)

	// Create a second node
	n2 := mock.Node()
	nodeReg = &structs.NodeRegisterRequest{
		Node:         n2,
		WriteRequest: structs.WriteRequest{Region: "global"},
	}
	must.NoError(t, msgpackrpc.CallWithCodec(codec, "Node.Register", nodeReg, &nodeResp))

	// Drain the first node
	drainReq := &structs.NodeUpdateDrainRequest{
		NodeID: n1.ID,
		DrainStrategy: &structs.DrainStrategy{
			DrainSpec: structs.DrainSpec{
				Deadline: 0 * time.Second, // Infinite
			},
		},
		WriteRequest: structs.WriteRequest{Region: "global"},
	}
	var drainResp structs.NodeDrainUpdateResponse
	must.NoError(t, msgpackrpc.CallWithCodec(codec, "Node.UpdateDrain", drainReq, &drainResp))

	// Setup client simulator
	errCh := make(chan error, 2)
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	go allocClientStateSimulator(t, errCh, ctx, srv, n1.ID, srv.logger)
	go allocClientStateSimulator(t, errCh, ctx, srv, n2.ID, srv.logger)

	// Wait for the service allocs (only) to be stopped on the draining node
	must.Wait(t, wait.InitialSuccess(wait.ErrorFunc(func() error {
		allocs, err := store.AllocsByJob(nil, job.Namespace, job.ID, false)
		must.NoError(t, err)
		for _, alloc := range allocs {
			if alloc.NodeID != n1.ID {
				continue
			}
			if alloc.DesiredStatus != structs.AllocDesiredStatusStop {
				return fmt.Errorf("got desired status %v", alloc.DesiredStatus)
			}
		}
		return checkAllocPromoter(errCh)
	}),
		wait.Timeout(10*time.Second),
		wait.Gap(100*time.Millisecond),
	))

	// Mark the batch allocations as finished
	allocs, err := store.AllocsByJob(nil, job.Namespace, bjob.ID, false)
	must.NoError(t, err)

	var updates []*structs.Allocation
	for _, alloc := range allocs {
		newAlloc := alloc.Copy()
		newAlloc.ClientStatus = structs.AllocClientStatusComplete
		updates = append(updates, newAlloc)
	}

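	// Submit the completion through the Node.UpdateAlloc RPC rather than
	// writing the state store directly, so the update is applied with a fresh
	// index and the drainer's blocking queries can observe it.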
	batchDoneReq := &structs.AllocUpdateRequest{
		Alloc:        updates,
		WriteRequest: structs.WriteRequest{Region: "global"},
	}
	err = msgpackrpc.CallWithCodec(codec, "Node.UpdateAlloc", batchDoneReq, &resp)
	must.NoError(t, err)

	// Wait for the service allocations to be replaced
	waitForPlacedAllocs(t, store, n2.ID, 3)

	// Wait for the node drain to be marked complete with the events we expect
	waitForNodeDrainComplete(t, store, n1.ID, errCh, 3, "")
}

func TestDrainer_AllTypes_Deadline_GarbageCollectedNode(t *testing.T) {
	ci.Parallel(t)

	srv, cleanupSrv := TestServer(t, nil)
	defer cleanupSrv()
	codec := rpcClient(t, srv)
	testutil.WaitForLeader(t, srv.RPC)
	store := srv.State()

	// Create a node
	n1 := mock.Node()
	nodeReg := &structs.NodeRegisterRequest{
		Node:         n1,
		WriteRequest: structs.WriteRequest{Region: "global"},
	}
	var nodeResp structs.NodeUpdateResponse
	must.NoError(t, msgpackrpc.CallWithCodec(codec, "Node.Register", nodeReg, &nodeResp))

	// Create a service job that runs on just one
	job := mock.Job()
	job.TaskGroups[0].Count = 2
	req := &structs.JobRegisterRequest{
		Job: job,
		WriteRequest: structs.WriteRequest{
			Region:    "global",
			Namespace: job.Namespace,
		},
	}
	var resp structs.JobRegisterResponse
	must.NoError(t, msgpackrpc.CallWithCodec(codec, "Job.Register", req, &resp))
	must.Positive(t, resp.Index)
	job.CreateIndex = resp.JobModifyIndex

	// Create a system job
	sysjob := mock.SystemJob()
	req = &structs.JobRegisterRequest{
		Job: sysjob,
		WriteRequest: structs.WriteRequest{
			Region:    "global",
			Namespace: job.Namespace,
		},
	}
	must.NoError(t, msgpackrpc.CallWithCodec(codec, "Job.Register", req, &resp))
	must.Positive(t, resp.Index)
	sysjob.CreateIndex = resp.JobModifyIndex

	// Create a batch job
	bjob := mock.BatchJob()
	bjob.TaskGroups[0].Count = 2
	req = &structs.JobRegisterRequest{
		Job: bjob,
		WriteRequest: structs.WriteRequest{
			Region:    "global",
			Namespace: job.Namespace,
		},
	}
	must.NoError(t, msgpackrpc.CallWithCodec(codec, "Job.Register", req, &resp))
	must.Positive(t, resp.Index)
	bjob.CreateIndex = resp.JobModifyIndex

	// Wait for the allocations to be placed
	waitForPlacedAllocs(t, store, n1.ID, 5)

	// Create some old terminal allocs for each job that point at a non-existent
	// node to simulate it being on a GC'd node.
	var badAllocs []*structs.Allocation
	for _, job := range []*structs.Job{job, sysjob, bjob} {
		alloc := mock.Alloc()
		alloc.Namespace = job.Namespace
		alloc.Job = job
		alloc.JobID = job.ID
		alloc.NodeID = uuid.Generate()
		alloc.TaskGroup = job.TaskGroups[0].Name
		alloc.DesiredStatus = structs.AllocDesiredStatusStop
		alloc.ClientStatus = structs.AllocClientStatusComplete
		badAllocs = append(badAllocs, alloc)
	}
	must.NoError(t, store.UpsertAllocs(structs.MsgTypeTestSetup, 1, badAllocs))

	// Create the second node
	n2 := mock.Node()
	nodeReg = &structs.NodeRegisterRequest{
		Node:         n2,
		WriteRequest: structs.WriteRequest{Region: "global"},
	}
	must.NoError(t, msgpackrpc.CallWithCodec(codec, "Node.Register", nodeReg, &nodeResp))

	// Drain the first node
	drainReq := &structs.NodeUpdateDrainRequest{
		NodeID: n1.ID,
		DrainStrategy: &structs.DrainStrategy{
			DrainSpec: structs.DrainSpec{
				Deadline: 2 * time.Second,
			},
		},
		WriteRequest: structs.WriteRequest{Region: "global"},
	}
	var drainResp structs.NodeDrainUpdateResponse
	must.NoError(t, msgpackrpc.CallWithCodec(codec, "Node.UpdateDrain", drainReq, &drainResp))

	// Setup client simulator
	errCh := make(chan error, 2)
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	go allocClientStateSimulator(t, errCh, ctx, srv, n1.ID, srv.logger)
	go allocClientStateSimulator(t, errCh, ctx, srv, n2.ID, srv.logger)

	// Wait for the allocs to be replaced
	waitForAllocsStop(t, store, n1.ID, errCh)
	waitForPlacedAllocs(t, store, n2.ID, 5)

	// Wait for the node drain to be marked complete with the events we expect
	waitForNodeDrainComplete(t, store, n1.ID, errCh, 3, drainer.NodeDrainEventDetailDeadlined)
}

// TestDrainer_MultipleNSes_ServiceOnly asserts that all jobs on a draining
// node are drained, even when they belong to different namespaces and share
// the same ID
func TestDrainer_MultipleNSes_ServiceOnly(t *testing.T) {
	ci.Parallel(t)

	srv, cleanupSrv := TestServer(t, nil)
	defer cleanupSrv()
	codec := rpcClient(t, srv)
	testutil.WaitForLeader(t, srv.RPC)
	store := srv.State()

	// Create a node
	n1 := mock.Node()
	nodeReg := &structs.NodeRegisterRequest{
		Node:         n1,
		WriteRequest: structs.WriteRequest{Region: "global"},
	}
	var nodeResp structs.NodeUpdateResponse
	must.NoError(t, msgpackrpc.CallWithCodec(codec, "Node.Register", nodeReg, &nodeResp))

	nsrv, ns2 := mock.Namespace(), mock.Namespace()
	nses := []*structs.Namespace{nsrv, ns2}
	nsReg := &structs.NamespaceUpsertRequest{
		Namespaces:   nses,
		WriteRequest: structs.WriteRequest{Region: "global"},
	}
	var nsResp structs.GenericResponse
	must.NoError(t, msgpackrpc.CallWithCodec(codec, "Namespace.UpsertNamespaces", nsReg, &nsResp))

	for _, ns := range nses {
		// Create a job for each namespace
		job := mock.Job()
		job.ID = "example"
		job.Name = "example"
		job.Namespace = ns.Name
		job.TaskGroups[0].Count = 1
		req := &structs.JobRegisterRequest{
			Job: job,
			WriteRequest: structs.WriteRequest{
				Region:    "global",
				Namespace: job.Namespace,
			},
		}

		// Fetch the response
		var resp structs.JobRegisterResponse
		must.NoError(t, msgpackrpc.CallWithCodec(codec, "Job.Register", req, &resp))
		must.Positive(t, resp.Index)
	}

	// Wait for the two allocations to be placed
	waitForPlacedAllocs(t, store, n1.ID, 2)

	// Create the second node
	n2 := mock.Node()
	nodeReg = &structs.NodeRegisterRequest{
		Node:         n2,
		WriteRequest: structs.WriteRequest{Region: "global"},
	}
	must.NoError(t, msgpackrpc.CallWithCodec(codec, "Node.Register", nodeReg, &nodeResp))

	// Drain the first node
	drainReq := &structs.NodeUpdateDrainRequest{
		NodeID: n1.ID,
		DrainStrategy: &structs.DrainStrategy{
			DrainSpec: structs.DrainSpec{
				Deadline: 10 * time.Minute,
			},
		},
		WriteRequest: structs.WriteRequest{Region: "global"},
	}
	var drainResp structs.NodeDrainUpdateResponse
	must.NoError(t, msgpackrpc.CallWithCodec(codec, "Node.UpdateDrain", drainReq, &drainResp))

	// Setup client simulator
	errCh := make(chan error, 2)
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	go allocClientStateSimulator(t, errCh, ctx, srv, n1.ID, srv.logger)
	go allocClientStateSimulator(t, errCh, ctx, srv, n2.ID, srv.logger)

	// Wait for the allocs to be replaced
	waitForAllocsStop(t, store, n1.ID, errCh)
	waitForPlacedAllocs(t, store, n2.ID, 2)

	// Wait for the node drain to be marked complete with the events we expect
	waitForNodeDrainComplete(t, store, n1.ID, errCh, 3, "")
}

// Test that transitions to force drain work.
func TestDrainer_Batch_TransitionToForce(t *testing.T) {
	ci.Parallel(t)

	for _, inf := range []bool{true, false} {
		name := "Infinite"
		if !inf {
			name = "Deadline"
		}
		t.Run(name, func(t *testing.T) {
			srv, cleanupSrv := TestServer(t, nil)
			defer cleanupSrv()
			codec := rpcClient(t, srv)
			testutil.WaitForLeader(t, srv.RPC)
			store := srv.State()

			// Create a node
			n1 := mock.Node()
			nodeReg := &structs.NodeRegisterRequest{
				Node:         n1,
				WriteRequest: structs.WriteRequest{Region: "global"},
			}
			var nodeResp structs.NodeUpdateResponse
			must.NoError(t, msgpackrpc.CallWithCodec(codec, "Node.Register", nodeReg, &nodeResp))

			// Create a batch job
			bjob := mock.BatchJob()
			bjob.TaskGroups[0].Count = 2
			req := &structs.JobRegisterRequest{
				Job: bjob,
				WriteRequest: structs.WriteRequest{
					Region:    "global",
					Namespace: bjob.Namespace,
				},
			}

			// Fetch the response
			var resp structs.JobRegisterResponse
			must.NoError(t, msgpackrpc.CallWithCodec(codec, "Job.Register", req, &resp))
			must.Positive(t, resp.Index)

			// Wait for the allocations to be placed
			waitForPlacedAllocs(t, store, n1.ID, 2)

			// Pick the deadline
			deadline := 0 * time.Second
			if !inf {
				deadline = 10 * time.Second
			}

			// Drain the node
			drainReq := &structs.NodeUpdateDrainRequest{
				NodeID: n1.ID,
				DrainStrategy: &structs.DrainStrategy{
					DrainSpec: structs.DrainSpec{
						Deadline: deadline,
					},
				},
				WriteRequest: structs.WriteRequest{Region: "global"},
			}
			var drainResp structs.NodeDrainUpdateResponse
			must.NoError(t, msgpackrpc.CallWithCodec(codec, "Node.UpdateDrain", drainReq, &drainResp))

			// Setup client simulator
			errCh := make(chan error, 1)
			ctx, cancel := context.WithCancel(context.Background())
			defer cancel()
			go allocClientStateSimulator(t, errCh, ctx, srv, n1.ID, srv.logger)

			// Make sure the batch job isn't affected
			must.Wait(t, wait.ContinualSuccess(wait.ErrorFunc(func() error {
				if err := checkAllocPromoter(errCh); err != nil {
					return fmt.Errorf("check alloc promoter error: %v", err)
				}

				allocs, err := store.AllocsByNode(nil, n1.ID)
				must.NoError(t, err)
				for _, alloc := range allocs {
					if alloc.DesiredStatus != structs.AllocDesiredStatusRun {
						return fmt.Errorf("got status %v", alloc.DesiredStatus)
					}
				}
				if len(allocs) != 2 {
					return fmt.Errorf("expected 2 allocs but got %d", len(allocs))
				}
				return nil
			}),
				wait.Timeout(500*time.Millisecond),
				wait.Gap(50*time.Millisecond),
			))

			// Force drain the node
			drainReq = &structs.NodeUpdateDrainRequest{
				NodeID: n1.ID,
				DrainStrategy: &structs.DrainStrategy{
					DrainSpec: structs.DrainSpec{
						Deadline: -1 * time.Second, // Force drain
					},
				},
				WriteRequest: structs.WriteRequest{Region: "global"},
			}
			must.NoError(t, msgpackrpc.CallWithCodec(
				codec, "Node.UpdateDrain", drainReq, &drainResp))

			// Make sure the batch job is migrated
			waitForAllocsStop(t, store, n1.ID, errCh)

			// Wait for the node drain to be marked complete with the events we expect
			waitForNodeDrainComplete(t, store, n1.ID, errCh, 4,
				drainer.NodeDrainEventDetailDeadlined)

		})
	}
}

// waitForNodeDrainComplete is a test helper that verifies the node drain has
// been removed and that the expected Node events have been written
func waitForNodeDrainComplete(t *testing.T, store *state.StateStore, nodeID string,
	errCh chan error, expectEvents int, expectDetail string) {
	t.Helper()

	var node *structs.Node

	must.Wait(t, wait.InitialSuccess(wait.ErrorFunc(func() error {
		if err := checkAllocPromoter(errCh); err != nil {
			return err
		}
		node, _ = store.NodeByID(nil, nodeID)
		if node.DrainStrategy != nil {
			return fmt.Errorf("has drain strategy still set")
		}
		// sometimes test gets a duplicate node drain complete event
		if len(node.Events) < expectEvents {
			return fmt.Errorf(
				"did not get enough events (expected %d): %v", expectEvents, node.Events)
		}
		return nil
	}),
		wait.Timeout(10*time.Second),
		wait.Gap(50*time.Millisecond),
	))

	must.Eq(t, drainer.NodeDrainEventComplete, node.Events[expectEvents-1].Message)
	if expectDetail != "" {
		must.MapContainsKey(t, node.Events[expectEvents-1].Details, expectDetail,
			must.Sprintf("%#v", node.Events[expectEvents-1].Details),
		)
	}
}

func waitForPlacedAllocs(t *testing.T, store *state.StateStore, nodeID string, count int) {
	t.Helper()
	must.Wait(t, wait.InitialSuccess(
		wait.BoolFunc(func() bool {
			allocs, err := store.AllocsByNode(nil, nodeID)
			must.NoError(t, err)
			return len(allocs) == count
		}),
		wait.Timeout(10*time.Second),
		wait.Gap(50*time.Millisecond),
	))
}

// waitForAllocsStop waits for all allocs on the node to be stopped
func waitForAllocsStop(t *testing.T, store *state.StateStore, nodeID string, errCh chan error) []*structs.Allocation {
	t.Helper()
	var finalAllocs []*structs.Allocation
	must.Wait(t, wait.InitialSuccess(
		wait.ErrorFunc(func() error {
			if err := checkAllocPromoter(errCh); err != nil {
				return err
			}

			var err error
			finalAllocs, err = store.AllocsByNode(nil, nodeID)
			must.NoError(t, err)
			for _, alloc := range finalAllocs {
				if alloc.DesiredStatus != structs.AllocDesiredStatusStop {
					return fmt.Errorf("expected stop but got %s", alloc.DesiredStatus)
				}
			}
			return nil
		}),
		wait.Timeout(10*time.Second),
		wait.Gap(50*time.Millisecond),
	))

	return finalAllocs
}