E2E: clarify drain -deadline and -force flag behaviors (#16868)

The `-deadline` and `-force` flags for the `nomad node drain` command only cause the drain to ignore the `migrate` block's healthy deadline, max parallel, etc. These flags don't affect the `kill_timeout` or `shutdown_delay` options of the jobspec. This changeset fixes the skipped E2E tests so that they validate the intended behavior, and updates the docs for clarity.
Parent: ec1a8ae12a
Commit: 4df2d9bda8
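To make the distinction concrete, here is an illustrative jobspec sketch (not part of the diff below; the job, group, and task names are invented) annotating which settings the drain flags interact with: `-deadline` and `-force` cut short the pacing that the `migrate` block would otherwise impose, while `shutdown_delay` and `kill_timeout` are still honored when the allocations are stopped.

# Illustrative sketch only -- job, group, and task names are made up.
job "example" {
  group "web" {

    # `nomad node drain -enable -deadline 10s <node-id>` (or -force) stops the
    # remaining allocations without waiting on this migrate pacing.
    migrate {
      max_parallel     = 1
      min_healthy_time = "30s"
      healthy_deadline = "5m"
    }

    task "app" {
      driver = "docker"

      # These still apply when the drain stops the task: the task gets its
      # shutdown_delay and then up to kill_timeout before being killed.
      shutdown_delay = "10s"
      kill_timeout   = "20s"

      config {
        image = "busybox:1"
      }
    }
  }
}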
@@ -24,7 +24,6 @@ import (
 	_ "github.com/hashicorp/nomad/e2e/lifecycle"
 	_ "github.com/hashicorp/nomad/e2e/metrics"
 	_ "github.com/hashicorp/nomad/e2e/networking"
-	_ "github.com/hashicorp/nomad/e2e/nodedrain"
 	_ "github.com/hashicorp/nomad/e2e/nomadexec"
 	_ "github.com/hashicorp/nomad/e2e/oversubscription"
 	_ "github.com/hashicorp/nomad/e2e/parameterized"
@@ -45,6 +44,7 @@ import (
 	// we get a quick check that they compile on every commit
 	_ "github.com/hashicorp/nomad/e2e/disconnectedclients"
 	_ "github.com/hashicorp/nomad/e2e/namespaces"
+	_ "github.com/hashicorp/nomad/e2e/nodedrain"
 	_ "github.com/hashicorp/nomad/e2e/volumes"
 )
@@ -2,40 +2,33 @@
 # SPDX-License-Identifier: MPL-2.0
 
 job "drain_deadline" {
-  datacenters = ["dc1", "dc2"]
 
   constraint {
     attribute = "${attr.kernel.name}"
     value     = "linux"
   }
 
+  migrate {
+    max_parallel     = 1
+    min_healthy_time = "30s"
+  }
+
   group "group" {
 
+    count = 2
+
     task "task" {
       driver = "docker"
 
-      kill_timeout = "2m"
-
       config {
         image   = "busybox:1"
         command = "/bin/sh"
-        args    = ["local/script.sh"]
+        args    = ["-c", "sleep 600"]
       }
 
-      template {
-        data        = <<EOF
-#!/bin/sh
-trap 'sleep 60' 2
-sleep 600
-EOF
-
-        destination = "local/script.sh"
-        change_mode = "noop"
-      }
-
       resources {
         cpu    = 256
-        memory = 128
+        memory = 64
       }
     }
   }
 }
@@ -30,6 +30,8 @@ func TestNodeDrain(t *testing.T) {
 	t.Run("IgnoreSystem", testIgnoreSystem)
 	t.Run("EphemeralMigrate", testEphemeralMigrate)
 	t.Run("KeepIneligible", testKeepIneligible)
+	t.Run("DeadlineFlag", testDeadlineFlag)
+	t.Run("ForceFlag", testForceFlag)
 }
 
 // testIgnoreSystem tests that system jobs are left behind when the
@@ -51,12 +53,15 @@ func testIgnoreSystem(t *testing.T) {
 	// Run a system job, which will not be moved when we drain the node
 	systemJobID := "test-node-drain-system-" + uuid.Short()
 	t.Cleanup(cleanupJobState(t, systemJobID))
-	registerAndWaitForRunning(t, nomadClient, systemJobID, "./input/drain_ignore_system.nomad", count)
+	must.NoError(t, e2eutil.Register(systemJobID, "./input/drain_ignore_system.nomad"))
+	waitForRunningAllocs(t, nomadClient, systemJobID, count)
 
 	// Also run a service job so we can verify when the drain is done
 	serviceJobID := "test-node-drain-service-" + uuid.Short()
 	t.Cleanup(cleanupJobState(t, serviceJobID))
-	serviceAllocs := registerAndWaitForRunning(t, nomadClient, serviceJobID, "./input/drain_simple.nomad", 1)
+	must.NoError(t, e2eutil.Register(serviceJobID, "./input/drain_simple.nomad"))
+	serviceAllocs := waitForRunningAllocs(t, nomadClient, serviceJobID, 1)
 	oldAllocID := serviceAllocs[0].ID
 	oldNodeID := serviceAllocs[0].NodeID
 
@@ -67,7 +72,8 @@ func testIgnoreSystem(t *testing.T) {
 	must.NoError(t, err, must.Sprintf("expected no error when marking node for drain: %v", out))
 
 	// The service job should be drained
-	newAllocs := waitForAllocDrain(t, nomadClient, serviceJobID, oldAllocID, oldNodeID)
+	newAllocs := waitForAllocDrainComplete(t, nomadClient, serviceJobID,
+		oldAllocID, oldNodeID, time.Second*120)
 	must.Len(t, 1, newAllocs, must.Sprint("expected 1 new service job alloc"))
 
 	// The system job should not have been drained
@@ -92,7 +98,8 @@ func testEphemeralMigrate(t *testing.T) {
 	nomadClient := e2eutil.NomadClient(t)
 	jobID := "drain-migrate-" + uuid.Short()
 
-	allocs := registerAndWaitForRunning(t, nomadClient, jobID, "./input/drain_migrate.nomad", 1)
+	must.NoError(t, e2eutil.Register(jobID, "./input/drain_migrate.nomad"))
+	allocs := waitForRunningAllocs(t, nomadClient, jobID, 1)
 	t.Cleanup(cleanupJobState(t, jobID))
 	oldAllocID := allocs[0].ID
 	oldNodeID := allocs[0].NodeID
@@ -117,7 +124,8 @@ func testEphemeralMigrate(t *testing.T) {
 	out, err := e2eutil.Command("nomad", "node", "drain", "-enable", "-yes", "-detach", oldNodeID)
 	must.NoError(t, err, must.Sprintf("expected no error when marking node for drain: %v", out))
 
-	newAllocs := waitForAllocDrain(t, nomadClient, jobID, oldAllocID, oldNodeID)
+	newAllocs := waitForAllocDrainComplete(t, nomadClient, jobID,
+		oldAllocID, oldNodeID, time.Second*120)
 	must.Len(t, 1, newAllocs, must.Sprint("expected 1 new alloc"))
 	newAllocID := newAllocs[0].ID
 	newNodeID := newAllocs[0].NodeID
@@ -176,42 +184,129 @@ func testKeepIneligible(t *testing.T) {
 	}
 }
 
-// registerAndWaitForRunning registers a job and waits for the expected number
-// of allocations to be in a running state. Returns the allocations.
-func registerAndWaitForRunning(t *testing.T, nomadClient *api.Client, jobID, jobSpec string, expectedCount int) []*api.AllocationListStub {
+// testDeadlineFlag tests the enforcement of the node drain deadline so that
+// allocations are moved even if max_parallel says we should be waiting
+func testDeadlineFlag(t *testing.T) {
+
+	nomadClient := e2eutil.NomadClient(t)
+	t.Cleanup(cleanupDrainState(t))
+
+	jobID := "test-node-drain-" + uuid.Short()
+	must.NoError(t, e2eutil.Register(jobID, "./input/drain_deadline.nomad"))
+	allocs := waitForRunningAllocs(t, nomadClient, jobID, 2)
+
+	t.Cleanup(cleanupJobState(t, jobID))
+	oldAllocID1 := allocs[0].ID
+	oldNodeID1 := allocs[0].NodeID
+	oldAllocID2 := allocs[1].ID
+	oldNodeID2 := allocs[1].NodeID
+
+	t.Logf("draining nodes %s, %s", oldNodeID1, oldNodeID2)
+	out, err := e2eutil.Command("nomad", "node", "eligibility", "-disable", oldNodeID1)
+	must.NoError(t, err, must.Sprintf("nomad node eligibility -disable failed: %v\n%v", err, out))
+	out, err = e2eutil.Command("nomad", "node", "eligibility", "-disable", oldNodeID2)
+	must.NoError(t, err, must.Sprintf("nomad node eligibility -disable failed: %v\n%v", err, out))
+
+	out, err = e2eutil.Command(
+		"nomad", "node", "drain",
+		"-deadline", "1s",
+		"-enable", "-yes", "-detach", oldNodeID1)
+	must.NoError(t, err, must.Sprintf("'nomad node drain %v' failed: %v\n%v", oldNodeID1, err, out))
+
+	out, err = e2eutil.Command(
+		"nomad", "node", "drain",
+		"-deadline", "1s",
+		"-enable", "-yes", "-detach", oldNodeID2)
+	must.NoError(t, err, must.Sprintf("'nomad node drain %v' failed: %v\n%v", oldNodeID2, err, out))
+
+	// with max_parallel=1 and min_healthy_time=30s we'd expect it to take ~60
+	// for both to be marked complete. Instead, because of the -deadline flag
+	// we'll expect the allocs to be stoppped almost immediately (give it 10s to
+	// avoid flakiness), and then the new allocs should come up and get marked
+	// healthy after ~30s
+	t.Log("waiting for old allocs to stop")
+	waitForAllocsStop(t, nomadClient, time.Second*10, oldAllocID1, oldAllocID2)
+
+	t.Log("waiting for running allocs")
+	waitForRunningAllocs(t, nomadClient, jobID, 2)
+}
+
+// testForceFlag tests the enforcement of the node drain -force flag so that
+// allocations are terminated immediately.
+func testForceFlag(t *testing.T) {
+
+	nomadClient := e2eutil.NomadClient(t)
+	t.Cleanup(cleanupDrainState(t))
+
+	jobID := "test-node-drain-" + uuid.Short()
+	must.NoError(t, e2eutil.Register(jobID, "./input/drain_deadline.nomad"))
+	allocs := waitForRunningAllocs(t, nomadClient, jobID, 2)
+
+	t.Cleanup(cleanupJobState(t, jobID))
+	oldAllocID1 := allocs[0].ID
+	oldNodeID1 := allocs[0].NodeID
+	oldAllocID2 := allocs[1].ID
+	oldNodeID2 := allocs[1].NodeID
+
+	t.Logf("draining nodes %s, %s", oldNodeID1, oldNodeID2)
+	out, err := e2eutil.Command("nomad", "node", "eligibility", "-disable", oldNodeID1)
+	must.NoError(t, err, must.Sprintf("nomad node eligibility -disable failed: %v\n%v", err, out))
+	out, err = e2eutil.Command("nomad", "node", "eligibility", "-disable", oldNodeID2)
+	must.NoError(t, err, must.Sprintf("nomad node eligibility -disable failed: %v\n%v", err, out))
+
+	out, err = e2eutil.Command(
+		"nomad", "node", "drain", "-force",
+		"-enable", "-yes", "-detach", oldNodeID1)
+	must.NoError(t, err, must.Sprintf("'nomad node drain %v' failed: %v\n%v", oldNodeID1, err, out))
+
+	out, err = e2eutil.Command(
+		"nomad", "node", "drain", "-force",
+		"-enable", "-yes", "-detach", oldNodeID2)
+	must.NoError(t, err, must.Sprintf("'nomad node drain %v' failed: %v\n%v", oldNodeID2, err, out))
+
+	// with max_parallel=1 and min_healthy_time=30s we'd expect it to take ~60
+	// for both to be marked complete. Instead, because of the -force flag
+	// we'll expect the allocs to be stoppped almost immediately (give it 10s to
+	// avoid flakiness), and then the new allocs should come up and get marked
+	// healthy after ~30s
+	t.Log("waiting for old allocs to stop")
+	waitForAllocsStop(t, nomadClient, time.Second*10, oldAllocID1, oldAllocID2)
+
+	t.Log("waiting for running allocs")
+	waitForRunningAllocs(t, nomadClient, jobID, 2)
+}
+
+func waitForRunningAllocs(t *testing.T, nomadClient *api.Client, jobID string, expectedRunningCount int) []*api.AllocationListStub {
 	t.Helper()
 
-	var allocs []*api.AllocationListStub
-	var err error
-	must.NoError(t, e2eutil.Register(jobID, jobSpec))
+	runningAllocs := set.From([]*api.AllocationListStub{})
 
 	must.Wait(t, wait.InitialSuccess(
 		wait.ErrorFunc(func() error {
-			allocs, _, err = nomadClient.Jobs().Allocations(jobID, false, nil)
-			if err != nil {
-				return fmt.Errorf("expected no error listing allocs: %v", err)
-			}
-			if len(allocs) != expectedCount {
-				return fmt.Errorf("expected %d allocs but found %d", expectedCount, len(allocs))
-			}
+			allocs, _, err := nomadClient.Jobs().Allocations(jobID, false, nil)
+			must.NoError(t, err)
+			count := 0
 			for _, alloc := range allocs {
-				if alloc.ClientStatus != structs.AllocClientStatusRunning {
-					return fmt.Errorf("alloc %q was %q, not running", alloc.ID, alloc.ClientStatus)
+				if alloc.ClientStatus == structs.AllocClientStatusRunning {
+					runningAllocs.Insert(alloc)
 				}
 			}
+			if runningAllocs.Size() < expectedRunningCount {
+				return fmt.Errorf("expected %d running allocs, got %d", expectedRunningCount, count)
+			}
 			return nil
 		}),
 		wait.Timeout(60*time.Second),
 		wait.Gap(500*time.Millisecond),
 	))
-	return allocs
+	return runningAllocs.Slice()
 }
 
-// waitForAllocDrain polls the allocation statues for a job until we've finished
+// waitForAllocDrainComplete polls the allocation statues for a job until we've finished
 // migrating:
 // - the old alloc should be stopped
 // - the new alloc should be running
-func waitForAllocDrain(t *testing.T, nomadClient *api.Client, jobID, oldAllocID, oldNodeID string) []*api.AllocationListStub {
+func waitForAllocDrainComplete(t *testing.T, nomadClient *api.Client, jobID, oldAllocID, oldNodeID string, deadline time.Duration) []*api.AllocationListStub {
 
 	t.Helper()
 	newAllocs := set.From([]*api.AllocationListStub{})
@@ -243,16 +338,45 @@ func waitForAllocDrain(t *testing.T, nomadClient *api.Client, jobID, oldAllocID,
 				oldNodeID[:8], time.Now().Sub(start))
 			return nil
 		}),
-		wait.Timeout(120*time.Second),
+		wait.Timeout(deadline),
 		wait.Gap(500*time.Millisecond),
 	))
 
 	return newAllocs.Slice()
 }
 
+// waitForAllocsStop polls the allocation statues for specific allocations until
+// they've stopped
+func waitForAllocsStop(t *testing.T, nomadClient *api.Client, deadline time.Duration, oldAllocIDs ...string) {
+	t.Helper()
+
+	must.Wait(t, wait.InitialSuccess(
+		wait.ErrorFunc(func() error {
+			for _, allocID := range oldAllocIDs {
+				alloc, _, err := nomadClient.Allocations().Info(allocID, nil)
+				must.NoError(t, err)
+				if alloc.ClientStatus != structs.AllocClientStatusComplete {
+					return fmt.Errorf("expected alloc %s to be complete, got %q",
+						allocID[:8], alloc.ClientStatus)
+				}
+			}
+			return nil
+		}),
+		wait.Timeout(deadline),
+		wait.Gap(500*time.Millisecond),
+	))
+}
+
 func cleanupJobState(t *testing.T, jobID string) func() {
 	return func() {
-		_, err := e2eutil.Command("nomad", "job", "stop", "-purge", jobID)
+		if os.Getenv("NOMAD_TEST_SKIPCLEANUP") == "1" {
+			return
+		}
+
+		// we can't use the CLI here because some tests will stop the job during
+		// a running deployment, which returns a non-zero exit code
+		nomadClient := e2eutil.NomadClient(t)
+		_, _, err := nomadClient.Jobs().Deregister(jobID, true, nil)
 		test.NoError(t, err)
 	}
 }
@@ -1,183 +0,0 @@
-// Copyright (c) HashiCorp, Inc.
-// SPDX-License-Identifier: MPL-2.0
-
-package nodedrain
-
-import (
-	"fmt"
-	"os"
-	"time"
-
-	e2e "github.com/hashicorp/nomad/e2e/e2eutil"
-	"github.com/hashicorp/nomad/e2e/framework"
-	"github.com/hashicorp/nomad/helper/uuid"
-	"github.com/hashicorp/nomad/testutil"
-)
-
-const ns = ""
-
-type NodeDrainE2ETest struct {
-	framework.TC
-	jobIDs  []string
-	nodeIDs []string
-}
-
-func init() {
-	framework.AddSuites(&framework.TestSuite{
-		Component:   "NodeDrain",
-		CanRunLocal: true,
-		Consul:      true,
-		Cases: []framework.TestCase{
-			new(NodeDrainE2ETest),
-		},
-	})
-}
-
-func (tc *NodeDrainE2ETest) BeforeAll(f *framework.F) {
-	e2e.WaitForLeader(f.T(), tc.Nomad())
-	e2e.WaitForNodesReady(f.T(), tc.Nomad(), 2) // needs at least 2 to test migration
-}
-
-func (tc *NodeDrainE2ETest) AfterEach(f *framework.F) {
-	if os.Getenv("NOMAD_TEST_SKIPCLEANUP") == "1" {
-		return
-	}
-
-	for _, id := range tc.jobIDs {
-		_, err := e2e.Command("nomad", "job", "stop", "-purge", id)
-		f.Assert().NoError(err)
-	}
-	tc.jobIDs = []string{}
-
-	for _, id := range tc.nodeIDs {
-		_, err := e2e.Command("nomad", "node", "drain", "-disable", "-yes", id)
-		f.Assert().NoError(err)
-		_, err = e2e.Command("nomad", "node", "eligibility", "-enable", id)
-		f.Assert().NoError(err)
-	}
-	tc.nodeIDs = []string{}
-
-	_, err := e2e.Command("nomad", "system", "gc")
-	f.Assert().NoError(err)
-}
-
-func nodesForJob(jobID string) ([]string, error) {
-	allocs, err := e2e.AllocsForJob(jobID, ns)
-	if err != nil {
-		return nil, err
-	}
-	if len(allocs) < 1 {
-		return nil, fmt.Errorf("no allocs found for job: %v", jobID)
-	}
-	nodes := []string{}
-	for _, alloc := range allocs {
-		nodes = append(nodes, alloc["Node ID"])
-	}
-	return nodes, nil
-}
-
-// waitForNodeDrain is a convenience wrapper that polls 'node status'
-// until the comparison function over the state of the job's allocs on that
-// node returns true
-func waitForNodeDrain(nodeID string, comparison func([]map[string]string) bool, wc *e2e.WaitConfig) error {
-	var got []map[string]string
-	var err error
-	interval, retries := wc.OrDefault()
-	testutil.WaitForResultRetries(retries, func() (bool, error) {
-		time.Sleep(interval)
-		got, err = e2e.AllocsForNode(nodeID)
-		if err != nil {
-			return false, err
-		}
-		return comparison(got), nil
-	}, func(e error) {
-		err = fmt.Errorf("node drain status check failed: %v\n%#v", e, got)
-	})
-	return err
-}
-
-// TestNodeDrainDeadline tests the enforcement of the node drain deadline so
-// that allocations are terminated even if they haven't gracefully exited.
-func (tc *NodeDrainE2ETest) TestNodeDrainDeadline(f *framework.F) {
-	f.T().Skip("The behavior is unclear and test assertions don't capture intent. Issue 9902")
-
-	jobID := "test-node-drain-" + uuid.Generate()[0:8]
-	f.NoError(e2e.Register(jobID, "nodedrain/input/drain_deadline.nomad"))
-	tc.jobIDs = append(tc.jobIDs, jobID)
-
-	expected := []string{"running"}
-	f.NoError(e2e.WaitForAllocStatusExpected(jobID, ns, expected), "job should be running")
-
-	nodes, err := nodesForJob(jobID)
-	f.NoError(err, "could not get nodes for job")
-	f.Len(nodes, 1, "could not get nodes for job")
-	nodeID := nodes[0]
-
-	f.T().Logf("draining node %v", nodeID)
-	out, err := e2e.Command(
-		"nomad", "node", "drain",
-		"-deadline", "5s",
-		"-enable", "-yes", "-detach", nodeID)
-	f.NoError(err, fmt.Sprintf("'nomad node drain %v' failed: %v\n%v", nodeID, err, out))
-	tc.nodeIDs = append(tc.nodeIDs, nodeID)
-
-	// the deadline is 40s but we can't guarantee its instantly terminated at
-	// that point, so we give it 30s which is well under the 2m kill_timeout in
-	// the job.
-	// deadline here needs to account for scheduling and propagation delays.
-	f.NoError(waitForNodeDrain(nodeID,
-		func(got []map[string]string) bool {
-			// FIXME: check the drain job alloc specifically. test
-			// may pass if client had another completed alloc
-			for _, alloc := range got {
-				if alloc["Status"] == "complete" {
-					return true
-				}
-			}
-			return false
-		}, &e2e.WaitConfig{Interval: time.Second, Retries: 40},
-	), "node did not drain immediately following deadline")
-}
-
-// TestNodeDrainForce tests the enforcement of the node drain -force flag so
-// that allocations are terminated immediately.
-func (tc *NodeDrainE2ETest) TestNodeDrainForce(f *framework.F) {
-	f.T().Skip("The behavior is unclear and test assertions don't capture intent. Issue 9902")
-
-	jobID := "test-node-drain-" + uuid.Generate()[0:8]
-	f.NoError(e2e.Register(jobID, "nodedrain/input/drain_deadline.nomad"))
-	tc.jobIDs = append(tc.jobIDs, jobID)
-
-	expected := []string{"running"}
-	f.NoError(e2e.WaitForAllocStatusExpected(jobID, ns, expected), "job should be running")
-
-	nodes, err := nodesForJob(jobID)
-	f.NoError(err, "could not get nodes for job")
-	f.Len(nodes, 1, "could not get nodes for job")
-	nodeID := nodes[0]
-
-	out, err := e2e.Command(
-		"nomad", "node", "drain",
-		"-force",
-		"-enable", "-yes", "-detach", nodeID)
-	f.NoError(err, fmt.Sprintf("'nomad node drain' failed: %v\n%v", err, out))
-	tc.nodeIDs = append(tc.nodeIDs, nodeID)
-
-	// we've passed -force but we can't guarantee its instantly terminated at
-	// that point, so we give it 30s which is under the 2m kill_timeout in
-	// the job
-	f.NoError(waitForNodeDrain(nodeID,
-		func(got []map[string]string) bool {
-			// FIXME: check the drain job alloc specifically. test
-			// may pass if client had another completed alloc
-			for _, alloc := range got {
-				if alloc["Status"] == "complete" {
-					return true
-				}
-			}
-			return false
-		}, &e2e.WaitConfig{Interval: time.Second, Retries: 40},
-	), "node did not drain immediately when forced")
-}
@@ -57,17 +57,19 @@ capability.
 - `-disable`: Disable node drain mode.
 
 - `-deadline`: Set the deadline by which all allocations must be moved off the
-  node. Remaining allocations after the deadline are force removed from the
-  node. Defaults to 1 hour.
+  node. Remaining allocations after the deadline are removed from the node,
+  regardless of their [`migrate`][] block. Defaults to 1 hour.
 
 - `-detach`: Return immediately instead of entering monitor mode.
 
 - `-monitor`: Enter monitor mode directly without modifying the drain status.
 
-- `-force`: Force remove allocations off the node immediately.
+- `-force`: Remove allocations off the node immediately, regardless of the
+  allocation's [`migrate`][] block.
 
-- `-no-deadline`: No deadline allows the allocations to drain off the node
-  without being force stopped after a certain deadline.
+- `-no-deadline`: No deadline allows the allocations to drain off the node,
+  ignoring the default 1 hour deadline before allocations are removed regardless
+  of their [`migrate`][] block.
 
 - `-ignore-system`: Ignore system allows the drain to complete without
   stopping system job allocations. By default system jobs (and CSI