dc013b5267
The CSI plugin allocations take a while to be marked healthy, which sometimes causes E2E test flakes during the setup phase of the tests. There's nothing CSI-specific about marking plugin allocs healthy, as the plugin supervisor hook does all the fingerprinting in the postrun hook (the prestart hook just makes a couple of empty directories). The timeouts we're seeing may be because of where we're pulling the images from: most of our jobs pull from a CDN-backed public registry, whereas these pull from ECR. Set a 1-minute timeout for these to make sure we have enough time to pull the image and start the task.
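The 1-minute wait the commit message describes shows up in the file below as pluginAllocWait, whose definition lives elsewhere in the csi test package and is not shown here. As a rough, hypothetical sketch only (assuming e2eutil.WaitConfig exposes Interval and Retries fields, as its zero-valued use in TestNodeDrain below suggests), such a wait could be expressed like this; the actual values in the package may differ:

// Hypothetical sketch: the real pluginAllocWait is defined elsewhere in the
// package and its values may differ; this merely illustrates a polling
// window of roughly one minute.
var pluginAllocWait = &e2eutil.WaitConfig{
	Interval: 5 * time.Second, // poll every 5 seconds...
	Retries:  12,              // ...up to 12 times, for about a minute total
}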
286 lines · 9.5 KiB · Go
package csi

import (
	"fmt"
	"time"

	"github.com/hashicorp/nomad/e2e/e2eutil"
	"github.com/hashicorp/nomad/e2e/framework"
	"github.com/hashicorp/nomad/helper/uuid"
	"github.com/hashicorp/nomad/testutil"
)

// CSIControllerPluginEBSTest exercises the AWS EBS plugin, which is an
// example of a plugin that supports most of the CSI Controller RPCs.
type CSIControllerPluginEBSTest struct {
	framework.TC
	uuid         string
	testJobIDs   []string
	volumeIDs    []string
	pluginJobIDs []string
	nodeIDs      []string
}
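
// ebsPluginID is the CSI plugin ID for the AWS EBS plugin jobs deployed and
// queried throughout these tests.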
const ebsPluginID = "aws-ebs0"

// BeforeAll waits for the cluster to be ready, deploys the CSI plugins, and
// creates two EBS volumes for use in the test.
func (tc *CSIControllerPluginEBSTest) BeforeAll(f *framework.F) {
	e2eutil.WaitForLeader(f.T(), tc.Nomad())
	e2eutil.WaitForNodesReady(f.T(), tc.Nomad(), 2)

	tc.uuid = uuid.Generate()[0:8]

	// deploy the controller plugin job
	controllerJobID := "aws-ebs-plugin-controller-" + tc.uuid
	f.NoError(e2eutil.Register(controllerJobID, "csi/input/plugin-aws-ebs-controller.nomad"))
	tc.pluginJobIDs = append(tc.pluginJobIDs, controllerJobID)

	f.NoError(e2eutil.WaitForAllocStatusComparison(
		func() ([]string, error) { return e2eutil.AllocStatuses(controllerJobID, ns) },
		func(got []string) bool {
			if len(got) != 2 {
				return false
			}
			for _, status := range got {
				if status != "running" {
					return false
				}
			}
			return true
		}, pluginAllocWait,
	), "plugin job should be running")

	// deploy the node plugins job
	nodesJobID := "aws-ebs-plugin-nodes-" + tc.uuid
	f.NoError(e2eutil.Register(nodesJobID, "csi/input/plugin-aws-ebs-nodes.nomad"))
	tc.pluginJobIDs = append(tc.pluginJobIDs, nodesJobID)

	f.NoError(e2eutil.WaitForAllocStatusComparison(
		func() ([]string, error) { return e2eutil.AllocStatuses(nodesJobID, ns) },
		func(got []string) bool {
			for _, status := range got {
				if status != "running" {
					return false
				}
			}
			return true
		}, nil,
	))

	f.NoError(waitForPluginStatusControllerCount(ebsPluginID, 2, pluginWait),
		"aws-ebs0 controller plugins did not become healthy")
	f.NoError(waitForPluginStatusMinNodeCount(ebsPluginID, 2, pluginWait),
		"aws-ebs0 node plugins did not become healthy")

	// ideally we'd wait until after we check `nomad volume status -verbose`
	// to verify these volumes are ready, but the plugin doesn't support the
	// CSI ListVolumes RPC
	volID := "ebs-vol[0]"
	err := volumeRegister(volID, "csi/input/ebs-volume0.hcl", "create")
	requireNoErrorElseDump(f, err, "could not create volume", tc.pluginJobIDs)
	tc.volumeIDs = append(tc.volumeIDs, volID)

	volID = "ebs-vol[1]"
	err = volumeRegister(volID, "csi/input/ebs-volume1.hcl", "create")
	requireNoErrorElseDump(f, err, "could not create volume", tc.pluginJobIDs)
	tc.volumeIDs = append(tc.volumeIDs, volID)
}

func (tc *CSIControllerPluginEBSTest) AfterEach(f *framework.F) {

	// Ensure nodes are all restored
	for _, id := range tc.nodeIDs {
		_, err := e2eutil.Command("nomad", "node", "drain", "-disable", "-yes", id)
		f.Assert().NoError(err)
		_, err = e2eutil.Command("nomad", "node", "eligibility", "-enable", id)
		f.Assert().NoError(err)
	}
	tc.nodeIDs = []string{}

	// Stop all jobs in test
	for _, id := range tc.testJobIDs {
		err := e2eutil.StopJob(id, "-purge")
		f.Assert().NoError(err)
	}
	tc.testJobIDs = []string{}

	// Garbage collect
	out, err := e2eutil.Command("nomad", "system", "gc")
	f.Assert().NoError(err, out)
}

// AfterAll cleans up the volumes and plugin jobs created by the test.
func (tc *CSIControllerPluginEBSTest) AfterAll(f *framework.F) {

	for _, volID := range tc.volumeIDs {
		err := waitForVolumeClaimRelease(volID, reapWait)
		f.Assert().NoError(err, "volume claims were not released")

		out, err := e2eutil.Command("nomad", "volume", "delete", volID)
		assertNoErrorElseDump(f, err,
			fmt.Sprintf("could not delete volume:\n%v", out), tc.pluginJobIDs)
	}

	// Deregister all plugin jobs in test
	for _, id := range tc.pluginJobIDs {
		err := e2eutil.StopJob(id, "-purge")
		f.Assert().NoError(err)
	}
	tc.pluginJobIDs = []string{}

	// Garbage collect
	out, err := e2eutil.Command("nomad", "system", "gc")
	f.Assert().NoError(err, out)
}

// TestVolumeClaim exercises the volume publish/unpublish workflows for the
// EBS plugin.
func (tc *CSIControllerPluginEBSTest) TestVolumeClaim(f *framework.F) {
	nomadClient := tc.Nomad()

	// deploy a job that writes to the volume
	writeJobID := "write-ebs-" + tc.uuid
	f.NoError(e2eutil.Register(writeJobID, "csi/input/use-ebs-volume.nomad"))
	f.NoError(
		e2eutil.WaitForAllocStatusExpected(writeJobID, ns, []string{"running"}),
		"job should be running")

	allocs, err := e2eutil.AllocsForJob(writeJobID, ns)
	f.NoError(err, "could not get allocs for write job")
	f.Len(allocs, 1, "could not get allocs for write job")
	writeAllocID := allocs[0]["ID"]

	// read data from volume and assert the writer wrote a file to it
	expectedPath := "/task/test/" + writeAllocID
	_, err = readFile(nomadClient, writeAllocID, expectedPath)
	f.NoError(err)

	// Shutdown (and purge) the writer so we can run a reader.
	// we could mount the EBS volume with multi-attach, but we
	// want this test to exercise the unpublish workflow.
	err = e2eutil.StopJob(writeJobID, "-purge")
	f.NoError(err)

	// wait for the volume unpublish workflow to complete
	for _, volID := range tc.volumeIDs {
		err := waitForVolumeClaimRelease(volID, reapWait)
		f.NoError(err, "volume claims were not released")
	}

	// deploy a job so we can read from the volume
	readJobID := "read-ebs-" + tc.uuid
	tc.testJobIDs = append(tc.testJobIDs, readJobID) // ensure failed tests clean up
	f.NoError(e2eutil.Register(readJobID, "csi/input/use-ebs-volume.nomad"))
	f.NoError(
		e2eutil.WaitForAllocStatusExpected(readJobID, ns, []string{"running"}),
		"job should be running")

	allocs, err = e2eutil.AllocsForJob(readJobID, ns)
	f.NoError(err, "could not get allocs for read job")
	f.Len(allocs, 1, "could not get allocs for read job")
	readAllocID := allocs[0]["ID"]

	// read data from volume and assert we can read the file the writer wrote
	expectedPath = "/task/test/" + readAllocID
	_, err = readFile(nomadClient, readAllocID, expectedPath)
	f.NoError(err)
}

// TestSnapshot exercises the snapshot commands.
func (tc *CSIControllerPluginEBSTest) TestSnapshot(f *framework.F) {

	out, err := e2eutil.Command("nomad", "volume", "snapshot", "create",
		tc.volumeIDs[0], "snap-"+tc.uuid)
	requireNoErrorElseDump(f, err, "could not create volume snapshot", tc.pluginJobIDs)

	snaps, err := e2eutil.ParseColumns(out)

	defer func() {
		// guard against panicking on an empty parse result; if we could
		// not parse a snapshot ID there is nothing we know how to delete
		if len(snaps) == 0 {
			return
		}
		_, err := e2eutil.Command("nomad", "volume", "snapshot", "delete",
			ebsPluginID, snaps[0]["Snapshot ID"])
		requireNoErrorElseDump(f, err, "could not delete volume snapshot", tc.pluginJobIDs)
	}()

	f.NoError(err, fmt.Sprintf("could not parse output:\n%v", out))
	f.Len(snaps, 1, fmt.Sprintf("could not parse output:\n%v", out))

	// the snapshot we're looking for should be the first one because
	// we just created it, but give us some breathing room to allow
	// for concurrent test runs
	out, err = e2eutil.Command("nomad", "volume", "snapshot", "list",
		"-plugin", ebsPluginID, "-per-page", "10")
	requireNoErrorElseDump(f, err, "could not list volume snapshots", tc.pluginJobIDs)
	f.Contains(out, snaps[0]["ID"],
		fmt.Sprintf("volume snapshot list did not include expected snapshot:\n%v", out))
}

// TestNodeDrain exercises the remounting behavior in the face of a node drain
func (tc *CSIControllerPluginEBSTest) TestNodeDrain(f *framework.F) {

	nomadClient := tc.Nomad()

	nodesJobID := "aws-ebs-plugin-nodes-" + tc.uuid
	pluginAllocs, err := e2eutil.AllocsForJob(nodesJobID, ns)
	f.NoError(err)
	expectedHealthyNodePlugins := len(pluginAllocs)

	// deploy a job that writes to the volume
	writeJobID := "write-ebs-for-drain-" + tc.uuid
	f.NoError(e2eutil.Register(writeJobID, "csi/input/use-ebs-volume.nomad"))
	f.NoError(
		e2eutil.WaitForAllocStatusExpected(writeJobID, ns, []string{"running"}),
		"job should be running")
	tc.testJobIDs = append(tc.testJobIDs, writeJobID) // ensure failed tests clean up

	allocs, err := e2eutil.AllocsForJob(writeJobID, ns)
	f.NoError(err, "could not get allocs for write job")
	f.Len(allocs, 1, "could not get allocs for write job")
	writeAllocID := allocs[0]["ID"]

	// read data from volume and assert the writer wrote a file to it
	expectedPath := "/task/test/" + writeAllocID
	_, err = readFile(nomadClient, writeAllocID, expectedPath)
	f.NoError(err)

	// intentionally set a long deadline so we can check the plugins
	// haven't been moved
	nodeID := allocs[0]["Node ID"]
	out, err := e2eutil.Command("nomad", "node",
		"drain", "-enable",
		"-deadline", "10m",
		"-yes", "-detach", nodeID)
	f.NoError(err, fmt.Sprintf("'nomad node drain' failed: %v\n%v", err, out))
	tc.nodeIDs = append(tc.nodeIDs, nodeID)
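
	// wait for the write job to get a replacement allocation (any alloc
	// other than the original) that reaches "running" before we check
	// the node plugins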
	wc := &e2eutil.WaitConfig{}
	interval, retries := wc.OrDefault()
	testutil.WaitForResultRetries(retries, func() (bool, error) {
		time.Sleep(interval)
		allocs, err := e2eutil.AllocsForJob(writeJobID, ns)
		if err != nil {
			return false, err
		}
		for _, alloc := range allocs {
			if alloc["ID"] != writeAllocID {
				if alloc["Status"] == "running" {
					return true, nil
				}
				if alloc["Status"] == "failed" {
					// no point in waiting anymore if we hit this case
					f.T().Fatal("expected replacement alloc not to fail")
				}
			}
		}
		return false, fmt.Errorf("expected replacement alloc to be running")
	}, func(e error) {
		err = e
	})
	f.NoError(err, "replacement alloc did not start running before timeout")

	// verify the node plugin allocs were not disturbed by the drain
	// (see the long deadline set above)
	pluginAllocs, err = e2eutil.AllocsForJob(nodesJobID, ns)
	f.NoError(err)
	f.Lenf(pluginAllocs, expectedHealthyNodePlugins,
		"expected node plugins to be unchanged, got: %v", pluginAllocs)
}