From 5117a22c30114b394fa3ce10aa6cb8299f7e265d Mon Sep 17 00:00:00 2001 From: Drew Bailey <2614075+drewbailey@users.noreply.github.com> Date: Mon, 3 Feb 2020 16:03:33 -0500 Subject: [PATCH] add e2e test for system sched ineligible nodes --- e2e/bin/run | 2 +- e2e/bin/update | 2 +- e2e/e2e_test.go | 1 + e2e/e2eutil/utils.go | 2 + e2e/systemsched/input/system_job0.nomad | 53 +++++++++ e2e/systemsched/input/system_job1.nomad | 53 +++++++++ e2e/systemsched/systemsched.go | 136 ++++++++++++++++++++++++ 7 files changed, 247 insertions(+), 2 deletions(-) create mode 100644 e2e/systemsched/input/system_job0.nomad create mode 100644 e2e/systemsched/input/system_job1.nomad create mode 100644 e2e/systemsched/systemsched.go diff --git a/e2e/bin/run b/e2e/bin/run index e4400c7b4..0bd1bc374 100755 --- a/e2e/bin/run +++ b/e2e/bin/run @@ -7,7 +7,7 @@ if [ "$1" == "" ]; then exit 1 fi -nodes=$(terraform output -json -state=terraform/terraform.tfstate | jq -r '(.clients,.servers).value[]') +nodes=$(terraform output -json -state=terraform/terraform.tfstate | jq -r '(.linux_clients,.servers).value[]') for node in $nodes do echo Executing: ssh -i terraform/keys/*.pem ubuntu@$node "$@" diff --git a/e2e/bin/update b/e2e/bin/update index 34c032cce..529b26f1a 100755 --- a/e2e/bin/update +++ b/e2e/bin/update @@ -7,7 +7,7 @@ fi set -e -nodes=$(terraform output -json -state=terraform/terraform.tfstate | jq -r '(.clients,.servers).value[]') +nodes=$(terraform output -json -state=terraform/terraform.tfstate | jq -r '(.linux_clients,.servers).value[]') for node in $nodes do echo Executing: scp -C -i terraform/keys/*.pem "$1" ubuntu@$node:"$2" diff --git a/e2e/e2e_test.go b/e2e/e2e_test.go index 993494eb8..8e3d0bf75 100644 --- a/e2e/e2e_test.go +++ b/e2e/e2e_test.go @@ -20,6 +20,7 @@ import ( _ "github.com/hashicorp/nomad/e2e/nomad09upgrade" _ "github.com/hashicorp/nomad/e2e/nomadexec" _ "github.com/hashicorp/nomad/e2e/spread" + _ "github.com/hashicorp/nomad/e2e/systemsched" _ "github.com/hashicorp/nomad/e2e/taskevents" ) diff --git a/e2e/e2eutil/utils.go b/e2e/e2eutil/utils.go index c9f204bab..816b8d9f0 100644 --- a/e2e/e2eutil/utils.go +++ b/e2e/e2eutil/utils.go @@ -5,6 +5,7 @@ import ( "testing" "time" + "github.com/davecgh/go-spew/spew" "github.com/hashicorp/nomad/api" "github.com/hashicorp/nomad/helper" "github.com/hashicorp/nomad/jobspec" @@ -144,6 +145,7 @@ func AllocIDsFromAllocationListStubs(allocs []*api.AllocationListStub) []string func DeploymentsForJob(t *testing.T, nomadClient *api.Client, jobID string) []*api.Deployment { ds, _, err := nomadClient.Deployments().List(nil) require.NoError(t, err) + spew.Dump(ds) out := []*api.Deployment{} for _, d := range ds { diff --git a/e2e/systemsched/input/system_job0.nomad b/e2e/systemsched/input/system_job0.nomad new file mode 100644 index 000000000..6c881e096 --- /dev/null +++ b/e2e/systemsched/input/system_job0.nomad @@ -0,0 +1,53 @@ +job "redis" { + datacenters = ["dc1"] + + type = "system" + + constraint { + attribute = "${attr.kernel.name}" + value = "linux" + } + + group "cache" { + count = 1 + + restart { + attempts = 10 + interval = "1m" + + delay = "2s" + mode = "delay" + } + + task "system_task" { + driver = "docker" + + config { + image = "bash:latest" + + command = "bash" + args = ["-c", "sleep 15000"] + } + + env { + version = "1" + } + + logs { + max_files = 1 + max_file_size = 9 + } + + resources { + cpu = 20 # 500 MHz + + memory = 40 # 256MB + + network { + mbits = 1 + port "db" {} + } + } + } + } +} diff --git a/e2e/systemsched/input/system_job1.nomad b/e2e/systemsched/input/system_job1.nomad new file mode 100644 index 000000000..aae1729c6 --- /dev/null +++ b/e2e/systemsched/input/system_job1.nomad @@ -0,0 +1,53 @@ +job "redis" { + datacenters = ["dc1"] + + type = "system" + + constraint { + attribute = "${attr.kernel.name}" + value = "linux" + } + + group "cache" { + count = 1 + + restart { + attempts = 10 + interval = "1m" + + delay = "2s" + mode = "delay" + } + + task "system_task" { + driver = "docker" + + config { + image = "bash:latest" + + command = "bash" + args = ["-c", "sleep 15000"] + } + + env { + version = "2" + } + + logs { + max_files = 1 + max_file_size = 9 + } + + resources { + cpu = 20 # 500 MHz + + memory = 40 # 256MB + + network { + mbits = 1 + port "db" {} + } + } + } + } +} diff --git a/e2e/systemsched/systemsched.go b/e2e/systemsched/systemsched.go new file mode 100644 index 000000000..8fb851420 --- /dev/null +++ b/e2e/systemsched/systemsched.go @@ -0,0 +1,136 @@ +package systemsched + +import ( + "fmt" + "time" + + "github.com/davecgh/go-spew/spew" + "github.com/hashicorp/nomad/api" + "github.com/hashicorp/nomad/e2e/e2eutil" + "github.com/hashicorp/nomad/e2e/framework" + "github.com/hashicorp/nomad/nomad/structs" + "github.com/hashicorp/nomad/testutil" + "github.com/stretchr/testify/require" +) + +type SystemSchedTest struct { + framework.TC + jobIDs []string + disabledNodeID string +} + +func init() { + framework.AddSuites(&framework.TestSuite{ + Component: "SystemScheduler", + CanRunLocal: true, + Cases: []framework.TestCase{ + new(SystemSchedTest), + }, + }) +} + +func (tc *SystemSchedTest) BeforeAll(f *framework.F) { + // Ensure cluster has leader before running tests + e2eutil.WaitForLeader(f.T(), tc.Nomad()) + e2eutil.WaitForNodesReady(f.T(), tc.Nomad(), 4) +} + +func (tc *SystemSchedTest) TestJobUpdateOnIneligbleNode(f *framework.F) { + t := f.T() + nomadClient := tc.Nomad() + + jobID := "system_deployment" + tc.jobIDs = append(tc.jobIDs, jobID) + e2eutil.RegisterAndWaitForAllocs(t, nomadClient, "systemsched/input/system_job0.nomad", jobID, "") + + jobs := nomadClient.Jobs() + allocs, _, err := jobs.Allocations(jobID, true, nil) + + // Mark one node as ineligible + nodesAPI := tc.Nomad().Nodes() + disabledNodeID := allocs[0].NodeID + _, err = nodesAPI.ToggleEligibility(disabledNodeID, false, nil) + require.NoError(t, err) + + // Assert all jobs still running + jobs = nomadClient.Jobs() + allocs, _, err = jobs.Allocations(jobID, true, nil) + require.NoError(t, err) + var disabledAlloc *api.AllocationListStub + + for _, alloc := range allocs { + // Ensure alloc is either running or complete + testutil.WaitForResultRetries(30, func() (bool, error) { + time.Sleep(time.Millisecond * 100) + alloc, _, err := nomadClient.Allocations().Info(alloc.ID, nil) + if err != nil { + return false, err + } + + return (alloc.ClientStatus == structs.AllocClientStatusRunning || + alloc.ClientStatus == structs.AllocClientStatusComplete), + fmt.Errorf("expected status running, but was: %s", alloc.ClientStatus) + }, func(err error) { + t.Fatalf("failed to wait on alloc: %v", err) + }) + if alloc.NodeID == disabledNodeID { + require.Equal(t, "run", alloc.DesiredStatus) + disabledAlloc = alloc + } + } + + require.NotNil(t, disabledAlloc) + + // Update job + spew.Dump("DREW UPDATE JOB") + e2eutil.RegisterAndWaitForAllocs(t, nomadClient, "systemsched/input/system_job1.nomad", jobID, "") + + // Get updated allocations + jobs = nomadClient.Jobs() + allocs, _, err = jobs.Allocations(jobID, false, nil) + require.NoError(t, err) + var allocIDs []string + for _, alloc := range allocs { + allocIDs = append(allocIDs, alloc.ID) + } + + e2eutil.WaitForAllocsRunning(t, nomadClient, allocIDs) + + allocs, _, err = jobs.Allocations(jobID, false, nil) + require.NoError(t, err) + + // Ensure disabled node alloc is still version 0 + var foundPreviousAlloc bool + for _, alloc := range allocs { + spew.Dump(alloc) + if alloc.ID == disabledAlloc.ID { + foundPreviousAlloc = true + require.Equal(t, uint64(0), alloc.JobVersion) + } else { + spew.Dump(alloc) + require.Equal(t, uint64(1), alloc.JobVersion) + } + require.Equal(t, "run", alloc.DesiredStatus) + } + require.True(t, foundPreviousAlloc, "unable to find previous alloc for ineligible node") +} + +func (tc *SystemSchedTest) AfterEach(f *framework.F) { + nomadClient := tc.Nomad() + + // Mark all nodes eligible + nodesAPI := tc.Nomad().Nodes() + nodes, _, _ := nodesAPI.List(nil) + for _, node := range nodes { + nodesAPI.ToggleEligibility(node.ID, true, nil) + } + + jobs := nomadClient.Jobs() + // Stop all jobs in test + for _, id := range tc.jobIDs { + jobs.Deregister(id, true, nil) + } + tc.jobIDs = []string{} + // Garbage collect + nomadClient.System().GarbageCollect() +}