e2e: node drain tests (#8906)

Exercise the `nomad node drain` features, driving them via the new CLI helpers.
Tim Gross 2020-09-21 11:52:11 -04:00 committed by GitHub
parent 34093f7747
commit 9cbc604308
6 changed files with 483 additions and 0 deletions
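
For reference, the cases in this suite drive the drain workflow through the e2eutil CLI helpers (which shell out to the nomad binary) rather than the Go API client. A rough sketch of the shared pattern, using only helpers that appear in the diff below; the framework scaffolding around it is assumed:

    // Register a job, find the node its allocation landed on, drain that
    // node, then poll the node's allocations until the old alloc completes.
    jobID := "test-node-drain-" + uuid.Generate()[0:8]
    f.NoError(e2e.Register(jobID, "nodedrain/input/drain_simple.nomad"))
    f.NoError(e2e.WaitForAllocStatusExpected(jobID, []string{"running"}))

    allocs, err := e2e.AllocsForJob(jobID)
    f.NoError(err)
    oldAllocID, nodeID := allocs[0]["ID"], allocs[0]["Node ID"]

    // The drain itself is issued through the generic Command helper.
    out, err := e2e.Command("nomad", "node", "drain", "-enable", "-yes", "-detach", nodeID)
    f.NoError(err, out)

    // waitForNodeDrain (defined in nodedrain.go below) polls the node's
    // allocations until the comparison function returns true.
    f.NoError(waitForNodeDrain(nodeID,
        func(got []map[string]string) bool {
            for _, alloc := range got {
                if alloc["ID"] == oldAllocID && alloc["Status"] == "complete" {
                    return true
                }
            }
            return false
        }, &e2e.WaitConfig{Interval: time.Millisecond * 100, Retries: 500},
    ), "node did not drain")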


@@ -19,6 +19,7 @@ import (
_ "github.com/hashicorp/nomad/e2e/hostvolumes" _ "github.com/hashicorp/nomad/e2e/hostvolumes"
_ "github.com/hashicorp/nomad/e2e/lifecycle" _ "github.com/hashicorp/nomad/e2e/lifecycle"
_ "github.com/hashicorp/nomad/e2e/metrics" _ "github.com/hashicorp/nomad/e2e/metrics"
_ "github.com/hashicorp/nomad/e2e/nodedrain"
_ "github.com/hashicorp/nomad/e2e/nomad09upgrade" _ "github.com/hashicorp/nomad/e2e/nomad09upgrade"
_ "github.com/hashicorp/nomad/e2e/nomadexec" _ "github.com/hashicorp/nomad/e2e/nomadexec"
_ "github.com/hashicorp/nomad/e2e/podman" _ "github.com/hashicorp/nomad/e2e/podman"

e2e/nodedrain/input/drain_deadline.nomad Normal file

@@ -0,0 +1,39 @@
job "drain_deadline" {
datacenters = ["dc1", "dc2"]
constraint {
attribute = "${attr.kernel.name}"
value = "linux"
}
group "group" {
task "task" {
driver = "docker"
kill_timeout = "30s"
config {
image = "busybox:1"
command = "/bin/sh"
args = ["local/script.sh"]
}
template {
data = <<EOF
#!/bin/sh
trap 'sleep 60' 2
sleep 600
EOF
destination = "local/script.sh"
change_mode = "noop"
}
resources {
cpu = 256
memory = 128
}
}
}
}

e2e/nodedrain/input/drain_ignore_system.nomad Normal file

@@ -0,0 +1,28 @@
job "drain_ignore_system_service" {
datacenters = ["dc1", "dc2"]
type = "system"
constraint {
attribute = "${attr.kernel.name}"
value = "linux"
}
group "group" {
task "task" {
driver = "docker"
config {
image = "busybox:1"
command = "/bin/sh"
args = ["-c", "sleep 300"]
}
resources {
cpu = 256
memory = 128
}
}
}
}

e2e/nodedrain/input/drain_migrate.nomad Normal file

@@ -0,0 +1,46 @@
job "drain_migrate" {
datacenters = ["dc1", "dc2"]
constraint {
attribute = "${attr.kernel.name}"
value = "linux"
}
group "group" {
ephemeral_disk {
migrate = true
size = "101"
}
task "task" {
driver = "docker"
config {
image = "busybox:1"
command = "/bin/sh"
args = ["local/test.sh"]
}
template {
data = <<EOT
#!/bin/sh
if [ ! -f /alloc/data/{{ env "NOMAD_JOB_NAME" }} ]; then
echo writing {{ env "NOMAD_ALLOC_ID" }} to /alloc/data/{{ env "NOMAD_JOB_NAME" }}
echo {{ env "NOMAD_ALLOC_ID" }} > /alloc/data/{{ env "NOMAD_JOB_NAME" }}
else
echo /alloc/data/{{ env "NOMAD_JOB_NAME" }} already exists
fi
sleep 3600
EOT
destination = "local/test.sh"
}
resources {
cpu = 256
memory = 128
}
}
}
}

e2e/nodedrain/input/drain_simple.nomad Normal file

@@ -0,0 +1,26 @@
job "drain_simple" {
datacenters = ["dc1", "dc2"]
constraint {
attribute = "${attr.kernel.name}"
value = "linux"
}
group "group" {
task "task" {
driver = "docker"
config {
image = "busybox:1"
command = "/bin/sh"
args = ["-c", "sleep 300"]
}
resources {
cpu = 256
memory = 128
}
}
}
}

e2e/nodedrain/nodedrain.go Normal file

@@ -0,0 +1,343 @@
package nodedrain
import (
"fmt"
"os"
"strings"
"time"
e2e "github.com/hashicorp/nomad/e2e/e2eutil"
"github.com/hashicorp/nomad/e2e/framework"
"github.com/hashicorp/nomad/helper/uuid"
"github.com/hashicorp/nomad/testutil"
)
type NodeDrainE2ETest struct {
framework.TC
jobIDs []string
nodeIDs []string
}
func init() {
framework.AddSuites(&framework.TestSuite{
Component: "NodeDrain",
CanRunLocal: true,
Consul: true,
Cases: []framework.TestCase{
new(NodeDrainE2ETest),
},
})
}
func (tc *NodeDrainE2ETest) BeforeAll(f *framework.F) {
e2e.WaitForLeader(f.T(), tc.Nomad())
e2e.WaitForNodesReady(f.T(), tc.Nomad(), 2) // needs at least 2 to test migration
}
func (tc *NodeDrainE2ETest) AfterEach(f *framework.F) {
if os.Getenv("NOMAD_TEST_SKIPCLEANUP") == "1" {
return
}
for _, id := range tc.jobIDs {
_, err := e2e.Command("nomad", "job", "stop", "-purge", id)
f.NoError(err)
}
tc.jobIDs = []string{}
for _, id := range tc.nodeIDs {
_, err := e2e.Command("nomad", "node", "drain", "-disable", "-yes", id)
f.NoError(err)
}
tc.nodeIDs = []string{}
_, err := e2e.Command("nomad", "system", "gc")
f.NoError(err)
}
func nodesForJob(jobID string) ([]string, error) {
allocs, err := e2e.AllocsForJob(jobID)
if err != nil {
return nil, err
}
if len(allocs) < 1 {
return nil, fmt.Errorf("no allocs found for job: %v", jobID)
}
nodes := []string{}
for _, alloc := range allocs {
nodes = append(nodes, alloc["Node ID"])
}
return nodes, nil
}
// waitForNodeDrain is a convenience wrapper that polls 'nomad node status'
// until the comparison function over the node's allocations returns true
func waitForNodeDrain(nodeID string, comparison func([]map[string]string) bool, wc *e2e.WaitConfig) error {
var got []map[string]string
var err error
interval, retries := wc.OrDefault()
testutil.WaitForResultRetries(retries, func() (bool, error) {
time.Sleep(interval)
got, err = e2e.AllocsForNode(nodeID)
if err != nil {
return false, err
}
return comparison(got), nil
}, func(e error) {
err = fmt.Errorf("node drain status check failed: %v\n%#v", e, got)
})
return err
}
// TestNodeDrainEphemeralMigrate tests that ephemeral_disk migrations work as
// expected even during a node drain.
func (tc *NodeDrainE2ETest) TestNodeDrainEphemeralMigrate(f *framework.F) {
jobID := "test-node-drain-" + uuid.Generate()[0:8]
f.NoError(e2e.Register(jobID, "nodedrain/input/drain_migrate.nomad"))
tc.jobIDs = append(tc.jobIDs, jobID)
expected := []string{"running"}
f.NoError(e2e.WaitForAllocStatusExpected(jobID, expected), "job should be running")
allocs, err := e2e.AllocsForJob(jobID)
f.NoError(err, "could not get allocs for job")
f.Len(allocs, 1, "could not get allocs for job")
oldAllocID := allocs[0]["ID"]
nodes, err := nodesForJob(jobID)
f.NoError(err, "could not get nodes for job")
f.Len(nodes, 1, "could not get nodes for job")
nodeID := nodes[0]
out, err := e2e.Command("nomad", "node", "drain", "-enable", "-yes", "-detach", nodeID)
f.NoError(err, fmt.Sprintf("'nomad node drain' failed: %v\n%v", err, out))
tc.nodeIDs = append(tc.nodeIDs, nodeID)
f.NoError(waitForNodeDrain(nodeID,
func(got []map[string]string) bool {
for _, alloc := range got {
if alloc["ID"] == oldAllocID && alloc["Status"] == "complete" {
return true
}
}
return false
}, &e2e.WaitConfig{Interval: time.Millisecond * 100, Retries: 500},
), "node did not drain")
// wait for the allocation to be migrated
expected = []string{"running", "complete"}
f.NoError(e2e.WaitForAllocStatusExpected(jobID, expected), "job should be running")
allocs, err = e2e.AllocsForJob(jobID)
f.NoError(err, "could not get allocations for job")
// the task writes its alloc ID to a file if it hasn't been previously
// written, so find the contents of the migrated file and make sure they
// match the old allocation, not the running one
var got string
var fsErr error
testutil.WaitForResultRetries(500, func() (bool, error) {
time.Sleep(time.Millisecond * 100)
for _, alloc := range allocs {
if alloc["Status"] == "running" && alloc["Node ID"] != nodeID && alloc["ID"] != oldAllocID {
got, fsErr = e2e.Command("nomad", "alloc", "fs",
alloc["ID"], fmt.Sprintf("alloc/data/%s", jobID))
if fsErr != nil {
return false, fsErr
}
if strings.TrimSpace(got) == oldAllocID {
return true, nil
} else {
return false, fmt.Errorf("expected %q, got %q", oldAllocID, got)
}
}
}
return false, fmt.Errorf("did not find a migrated alloc")
}, func(e error) {
fsErr = e
})
f.NoError(fsErr, "node drained but migration failed")
}
// TestNodeDrainIgnoreSystem tests that system jobs are left behind when the
// -ignore-system flag is used.
func (tc *NodeDrainE2ETest) TestNodeDrainIgnoreSystem(f *framework.F) {
nodes, err := e2e.NodeStatusListFiltered(
func(section string) bool {
kernelName, err := e2e.GetField(section, "kernel.name")
return err == nil && kernelName == "linux"
})
f.NoError(err, "could not get node status listing")
serviceJobID := "test-node-drain-service-" + uuid.Generate()[0:8]
systemJobID := "test-node-drain-system-" + uuid.Generate()[0:8]
f.NoError(e2e.Register(serviceJobID, "nodedrain/input/drain_simple.nomad"))
tc.jobIDs = append(tc.jobIDs, serviceJobID)
allocs, err := e2e.AllocsForJob(serviceJobID)
f.NoError(err, "could not get allocs for service job")
f.Len(allocs, 1, "could not get allocs for service job")
oldAllocID := allocs[0]["ID"]
f.NoError(e2e.Register(systemJobID, "nodedrain/input/drain_ignore_system.nomad"))
tc.jobIDs = append(tc.jobIDs, systemJobID)
expected := []string{"running"}
f.NoError(e2e.WaitForAllocStatusExpected(serviceJobID, expected),
"service job should be running")
// can't just give it a static list because the number of nodes can vary
f.NoError(
e2e.WaitForAllocStatusComparison(
func() ([]string, error) { return e2e.AllocStatuses(systemJobID) },
func(got []string) bool {
if len(got) != len(nodes) {
return false
}
for _, status := range got {
if status != "running" {
return false
}
}
return true
}, nil,
),
"system job should be running on every node",
)
jobNodes, err := nodesForJob(serviceJobID)
f.NoError(err, "could not get nodes for job")
f.Len(jobNodes, 1, "could not get nodes for job")
nodeID := jobNodes[0]
out, err := e2e.Command(
"nomad", "node", "drain",
"-ignore-system", "-enable", "-yes", "-detach", nodeID)
f.NoError(err, fmt.Sprintf("'nomad node drain' failed: %v\n%v", err, out))
tc.nodeIDs = append(tc.nodeIDs, nodeID)
f.NoError(waitForNodeDrain(nodeID,
func(got []map[string]string) bool {
for _, alloc := range got {
if alloc["ID"] == oldAllocID && alloc["Status"] == "complete" {
return true
}
}
return false
}, &e2e.WaitConfig{Interval: time.Millisecond * 100, Retries: 500},
), "node did not drain")
allocs, err = e2e.AllocsForJob(systemJobID)
f.NoError(err, "could not query allocs for system job")
f.Equal(len(nodes), len(allocs), "system job should still be running on every node")
for _, alloc := range allocs {
f.Equal("run", alloc["Desired"], "no system allocs should be draining")
f.Equal("running", alloc["Status"], "no system allocs should be draining")
}
}
// TestNodeDrainDeadline tests the enforcement of the node drain deadline so
// that allocations are terminated even if they haven't gracefully exited.
func (tc *NodeDrainE2ETest) TestNodeDrainDeadline(f *framework.F) {
jobID := "test-node-drain-" + uuid.Generate()[0:8]
f.NoError(e2e.Register(jobID, "nodedrain/input/drain_deadline.nomad"))
tc.jobIDs = append(tc.jobIDs, jobID)
expected := []string{"running"}
f.NoError(e2e.WaitForAllocStatusExpected(jobID, expected), "job should be running")
nodes, err := nodesForJob(jobID)
f.NoError(err, "could not get nodes for job")
f.Len(nodes, 1, "could not get nodes for job")
nodeID := nodes[0]
out, err := e2e.Command(
"nomad", "node", "drain",
"-deadline", "5s",
"-enable", "-yes", "-detach", nodeID)
f.NoError(err, fmt.Sprintf("'nomad node drain' failed: %v\n%v", err, out))
tc.nodeIDs = append(tc.nodeIDs, nodeID)
// the deadline is 5s but we can't guarantee the allocation is terminated
// instantly at that point, so we give it 10s, which is still well under the
// 30s kill_timeout in the job
f.NoError(waitForNodeDrain(nodeID,
func(got []map[string]string) bool {
for _, alloc := range got {
if alloc["Status"] == "complete" {
return true
}
}
return false
}, &e2e.WaitConfig{Interval: time.Millisecond * 100, Retries: 100},
), "node did not drain immediately following deadline")
}
// TestNodeDrainForce tests the enforcement of the node drain -force flag
// so that allocations are terminated immediately.
func (tc *NodeDrainE2ETest) TestNodeDrainForce(f *framework.F) {
jobID := "test-node-drain-" + uuid.Generate()[0:8]
f.NoError(e2e.Register(jobID, "nodedrain/input/drain_deadline.nomad"))
tc.jobIDs = append(tc.jobIDs, jobID)
expected := []string{"running"}
f.NoError(e2e.WaitForAllocStatusExpected(jobID, expected), "job should be running")
nodes, err := nodesForJob(jobID)
f.NoError(err, "could not get nodes for job")
f.Len(nodes, 1, "could not get nodes for job")
nodeID := nodes[0]
out, err := e2e.Command(
"nomad", "node", "drain",
"-force",
"-enable", "-yes", "-detach", nodeID)
f.NoError(err, fmt.Sprintf("'nomad node drain' failed: %v\n%v", err, out))
tc.nodeIDs = append(tc.nodeIDs, nodeID)
// we've passed -force, but we can't guarantee the allocation is terminated
// instantly, so we give it 20s, which is still under the 30s kill_timeout in
// the job
f.NoError(waitForNodeDrain(nodeID,
func(got []map[string]string) bool {
for _, alloc := range got {
if alloc["Status"] == "complete" {
return true
}
}
return false
}, &e2e.WaitConfig{Interval: time.Millisecond * 100, Retries: 200},
), "node did not drain immediately when forced")
}
// TestNodeDrainKeepIneligible tests that nodes can be kept ineligible for
// scheduling after disabling drain.
func (tc *NodeDrainE2ETest) TestNodeDrainKeepIneligible(f *framework.F) {
nodes, err := e2e.NodeStatusList()
f.NoError(err, "could not get node status listing")
nodeID := nodes[0]["ID"]
out, err := e2e.Command("nomad", "node", "drain", "-enable", "-yes", "-detach", nodeID)
f.NoError(err, fmt.Sprintf("'nomad node drain' failed: %v\n%v", err, out))
tc.nodeIDs = append(tc.nodeIDs, nodeID)
_, err = e2e.Command(
"nomad", "node", "drain",
"-disable", "-keep-ineligible", "-yes", nodeID)
f.NoError(err, fmt.Sprintf("'nomad node drain' failed: %v\n%v", err, out))
nodes, err = e2e.NodeStatusList()
f.NoError(err, "could not get updated node status listing")
f.Equal("ineligible", nodes[0]["Eligibility"])
f.Equal("false", nodes[0]["Drain"])
}