package csi

import (
	"bytes"
	"context"
	"fmt"
	"io"
	"io/ioutil"
	"os"
	"os/exec"
	"regexp"
	"strconv"
	"strings"
	"time"

	"github.com/stretchr/testify/require"

	"github.com/hashicorp/nomad/api"
	e2e "github.com/hashicorp/nomad/e2e/e2eutil"
	"github.com/hashicorp/nomad/e2e/framework"
	"github.com/hashicorp/nomad/helper/uuid"
	"github.com/hashicorp/nomad/testutil"
)
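
// CSIVolumesTest exercises the CSI volume lifecycle end to end. The ID
// slices record the jobs and volumes each test creates so that AfterEach
// can clean them up even when a test fails partway through.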
type CSIVolumesTest struct {
	framework.TC
	testJobIDs   []string
	volumeIDs    []string
	pluginJobIDs []string
}

func init() {
	framework.AddSuites(&framework.TestSuite{
		Component:   "CSI",
		CanRunLocal: true,
		Consul:      false,
		Cases: []framework.TestCase{
			new(CSIVolumesTest),
		},
	})
}
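
// ns is the namespace passed to the e2e helpers; the empty string leaves
// the CLI's default namespace in effect.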
const ns = ""
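
// Plugins run as Docker containers, and pulling their images from the
// registry can delay startup, which is a source of test flakiness, so
// give the plugins up to three minutes to become healthy.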
var pluginWait = &e2e.WaitConfig{Interval: 5 * time.Second, Retries: 36} // 3min
var reapWait = &e2e.WaitConfig{Interval: 5 * time.Second, Retries: 36}   // 3min
func (tc *CSIVolumesTest) BeforeAll(f *framework.F) {
	t := f.T()

	_, err := os.Stat("csi/input/volume-ebs.hcl")
	if err != nil {
		t.Skip("skipping CSI test because EBS volume spec file missing:", err)
	}

	_, err = os.Stat("csi/input/volume-efs.hcl")
	if err != nil {
		t.Skip("skipping CSI test because EFS volume spec file missing:", err)
	}

	// Ensure cluster has leader and at least two client
	// nodes in a ready state before running tests
	e2e.WaitForLeader(t, tc.Nomad())
	e2e.WaitForNodesReady(t, tc.Nomad(), 2)
}

// TestEBSVolumeClaim launches AWS EBS plugins and registers an EBS volume
// as a Nomad CSI volume. We then deploy a job that writes to the volume,
// stop that job, and reuse the volume for another job which should be able
// to read the data written by the first job.
func (tc *CSIVolumesTest) TestEBSVolumeClaim(f *framework.F) {
	t := f.T()
	require := require.New(t)
	nomadClient := tc.Nomad()
	uuid := uuid.Generate()
	pluginID := "aws-ebs0"

	// deploy the controller plugin job
	controllerJobID := "aws-ebs-plugin-controller-" + uuid[0:8]
	f.NoError(e2e.Register(controllerJobID, "csi/input/plugin-aws-ebs-controller.nomad"))
	tc.pluginJobIDs = append(tc.pluginJobIDs, controllerJobID)
	expected := []string{"running", "running"}
	f.NoError(
		e2e.WaitForAllocStatusExpected(controllerJobID, ns, expected),
		"job should be running")

	// deploy the node plugins job
	nodesJobID := "aws-ebs-plugin-nodes-" + uuid[0:8]
	f.NoError(e2e.Register(nodesJobID, "csi/input/plugin-aws-ebs-nodes.nomad"))
	tc.pluginJobIDs = append(tc.pluginJobIDs, nodesJobID)
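
	// wait for the node plugin allocation on every client to be running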
	f.NoError(e2e.WaitForAllocStatusComparison(
		func() ([]string, error) { return e2e.AllocStatuses(nodesJobID, ns) },
		func(got []string) bool {
			for _, status := range got {
				if status != "running" {
					return false
				}
			}
			return true
		}, nil,
	))

	f.NoError(waitForPluginStatusControllerCount(pluginID, 2, pluginWait),
		"aws-ebs0 controller plugins did not become healthy")
	f.NoError(waitForPluginStatusMinNodeCount(pluginID, 2, pluginWait),
		"aws-ebs0 node plugins did not become healthy")

	// register a volume
	// TODO: we don't have a unique ID threaded thru the jobspec yet
	volID := "ebs-vol0"
	err := volumeRegister(volID, "csi/input/volume-ebs.hcl")
	require.NoError(err)
	tc.volumeIDs = append(tc.volumeIDs, volID)

	// deploy a job that writes to the volume
	writeJobID := "write-ebs-" + uuid[0:8]
	f.NoError(e2e.Register(writeJobID, "csi/input/use-ebs-volume.nomad"))
	f.NoError(
		e2e.WaitForAllocStatusExpected(writeJobID, ns, []string{"running"}),
		"job should be running")

	allocs, err := e2e.AllocsForJob(writeJobID, ns)
	f.NoError(err, "could not get allocs for write job")
	f.Len(allocs, 1, "could not get allocs for write job")
	writeAllocID := allocs[0]["ID"]

	// read data from volume and assert the writer wrote a file to it
	expectedPath := "/task/test/" + writeAllocID
	_, err = readFile(nomadClient, writeAllocID, expectedPath)
	require.NoError(err)

	// Shut down (and purge) the writer so we can run a reader.
	// We could mount the EBS volume with multi-attach, but we
	// want this test to exercise the unpublish workflow.
	_, err = e2e.Command("nomad", "job", "stop", "-purge", writeJobID)
	require.NoError(err)

	// wait for the volume unpublish workflow to complete
	require.NoError(waitForVolumeClaimRelease(volID, reapWait),
		"write-ebs alloc claim was not released")

	// deploy a job so we can read from the volume
	readJobID := "read-ebs-" + uuid[0:8]
	tc.testJobIDs = append(tc.testJobIDs, readJobID) // ensure failed tests clean up
	f.NoError(e2e.Register(readJobID, "csi/input/use-ebs-volume.nomad"))
	f.NoError(
		e2e.WaitForAllocStatusExpected(readJobID, ns, []string{"running"}),
		"job should be running")

	allocs, err = e2e.AllocsForJob(readJobID, ns)
	f.NoError(err, "could not get allocs for read job")
	f.Len(allocs, 1, "could not get allocs for read job")
	readAllocID := allocs[0]["ID"]

	// read back from the volume; the reader job writes a file named for
	// its own allocation ID, so this asserts the reused volume was mounted
	// and written by the second job
	expectedPath = "/task/test/" + readAllocID
	_, err = readFile(nomadClient, readAllocID, expectedPath)
	require.NoError(err)
}

// TestEFSVolumeClaim launches AWS EFS plugins and registers an EFS volume
// as a Nomad CSI volume. We then deploy a job that writes to the volume,
// and share the volume with another job which should be able to read the
// data written by the first job.
func (tc *CSIVolumesTest) TestEFSVolumeClaim(f *framework.F) {
	t := f.T()
	require := require.New(t)
	nomadClient := tc.Nomad()
	uuid := uuid.Generate()
	pluginID := "aws-efs0"

	// deploy the node plugins job (no need for a controller for EFS)
	nodesJobID := "aws-efs-plugin-nodes-" + uuid[0:8]
	f.NoError(e2e.Register(nodesJobID, "csi/input/plugin-aws-efs-nodes.nomad"))
	tc.pluginJobIDs = append(tc.pluginJobIDs, nodesJobID)
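
	// wait for the node plugin allocation on every client to be running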
	f.NoError(e2e.WaitForAllocStatusComparison(
		func() ([]string, error) { return e2e.AllocStatuses(nodesJobID, ns) },
		func(got []string) bool {
			for _, status := range got {
				if status != "running" {
					return false
				}
			}
			return true
		}, nil,
	))

	f.NoError(waitForPluginStatusMinNodeCount(pluginID, 2, pluginWait),
		"aws-efs0 node plugins did not become healthy")

	// register a volume
	volID := "efs-vol0"
	err := volumeRegister(volID, "csi/input/volume-efs.hcl")
	require.NoError(err)
	tc.volumeIDs = append(tc.volumeIDs, volID)

	// deploy a job that writes to the volume
	writeJobID := "write-efs-" + uuid[0:8]
	tc.testJobIDs = append(tc.testJobIDs, writeJobID) // ensure failed tests clean up
	f.NoError(e2e.Register(writeJobID, "csi/input/use-efs-volume-write.nomad"))
	f.NoError(
		e2e.WaitForAllocStatusExpected(writeJobID, ns, []string{"running"}),
		"job should be running")

	allocs, err := e2e.AllocsForJob(writeJobID, ns)
	f.NoError(err, "could not get allocs for write job")
	f.Len(allocs, 1, "could not get allocs for write job")
	writeAllocID := allocs[0]["ID"]

	// read data from volume and assert the writer wrote a file to it
	expectedPath := "/task/test/" + writeAllocID
	_, err = readFile(nomadClient, writeAllocID, expectedPath)
	require.NoError(err)

	// Shut down the writer so we can run a reader.
	// Although EFS should support multiple readers, the plugin
	// does not.
	_, err = e2e.Command("nomad", "job", "stop", writeJobID)
	require.NoError(err)

	// wait for the volume unpublish workflow to complete
	require.NoError(waitForVolumeClaimRelease(volID, reapWait),
		"write-efs alloc claim was not released")

	// deploy a job that reads from the volume
	readJobID := "read-efs-" + uuid[0:8]
	tc.testJobIDs = append(tc.testJobIDs, readJobID) // ensure failed tests clean up
	f.NoError(e2e.Register(readJobID, "csi/input/use-efs-volume-read.nomad"))
	f.NoError(
		e2e.WaitForAllocStatusExpected(readJobID, ns, []string{"running"}),
		"job should be running")

	allocs, err = e2e.AllocsForJob(readJobID, ns)
	f.NoError(err, "could not get allocs for read job")
	f.Len(allocs, 1, "could not get allocs for read job")
	readAllocID := allocs[0]["ID"]

	// read from the volume and assert the reader can read the file the
	// writer wrote
	_, err = readFile(nomadClient, readAllocID, expectedPath)
	require.NoError(err)
}
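
// AfterEach stops the test jobs, deregisters their volumes, stops the
// plugin jobs, and garbage-collects the cluster, in that order, so that
// volume claims are released before the plugins that serve them go away.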
func (tc *CSIVolumesTest) AfterEach(f *framework.F) {

	// Stop all jobs in test
	for _, id := range tc.testJobIDs {
		out, err := e2e.Command("nomad", "job", "stop", "-purge", id)
		f.Assert().NoError(err, out)
	}
	tc.testJobIDs = []string{}

	// Deregister all volumes in test
	for _, id := range tc.volumeIDs {
		// make sure all the test jobs have finished unpublishing claims
		err := waitForVolumeClaimRelease(id, reapWait)
		f.Assert().NoError(err, "volume claims were not released")

		out, err := e2e.Command("nomad", "volume", "deregister", id)
		if err != nil {
			fmt.Println("could not deregister volume, dumping allocation logs")
			f.Assert().NoError(tc.dumpLogs())
		}
		f.Assert().NoError(err, out)
	}
	tc.volumeIDs = []string{}

	// Deregister all plugin jobs in test
	for _, id := range tc.pluginJobIDs {
		out, err := e2e.Command("nomad", "job", "stop", "-purge", id)
		f.Assert().NoError(err, out)
	}
	tc.pluginJobIDs = []string{}

	// Garbage collect
	out, err := e2e.Command("nomad", "system", "gc")
	f.Assert().NoError(err, out)
}
// waitForVolumeClaimRelease makes sure we don't try to re-claim a volume
// that's in the process of being unpublished. We can't just wait for allocs
// to stop; we need to wait for their claims to be released.
func waitForVolumeClaimRelease(volID string, wc *e2e.WaitConfig) error {
	var out string
	var err error
	testutil.WaitForResultRetries(wc.Retries, func() (bool, error) {
		time.Sleep(wc.Interval)
		out, err = e2e.Command("nomad", "volume", "status", volID)
		if err != nil {
			return false, err
		}
		section, err := e2e.GetSection(out, "Allocations")
		if err != nil {
			return false, err
		}
		return strings.Contains(section, "No allocations placed"), nil
	}, func(e error) {
		if e == nil {
			err = nil
			return
		}
		err = fmt.Errorf("alloc claim was not released: %v\n%s", e, out)
	})
	return err
}
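
// dumpLogs prints stderr logs from every plugin allocation to stdout when
// running in CI, and writes them to per-allocation log files otherwise.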
func (tc *CSIVolumesTest) dumpLogs() error {
	for _, id := range tc.pluginJobIDs {
		allocs, err := e2e.AllocsForJob(id, ns)
		if err != nil {
			return fmt.Errorf("could not find allocs for plugin: %v", err)
		}
		for _, alloc := range allocs {
			allocID := alloc["ID"]
			out, err := e2e.AllocLogs(allocID, e2e.LogsStdErr)
			if err != nil {
				return fmt.Errorf("could not get logs for alloc: %v\n%s", err, out)
			}
			_, isCI := os.LookupEnv("CI")
			if isCI {
				fmt.Println("--------------------------------------")
				fmt.Println("allocation logs:", allocID)
				fmt.Println(out)
				continue
			}
			f, err := os.Create(allocID + ".log")
			if err != nil {
				return fmt.Errorf("could not create log file: %v", err)
			}
			defer f.Close()
			_, err = f.WriteString(out)
			if err != nil {
				return fmt.Errorf("could not write to log file: %v", err)
			}
			fmt.Printf("nomad alloc logs written to %s.log\n", allocID)
		}
	}
	return nil
}

// TODO(tgross): replace this w/ AllocFS().Stat() after
// https://github.com/hashicorp/nomad/issues/7365 is fixed
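// readFile runs "cat" inside the allocation's task to read the file at
// path, returning its contents on stdout.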
func readFile(client *api.Client, allocID string, path string) (bytes.Buffer, error) {
	var stdout, stderr bytes.Buffer
	alloc, _, err := client.Allocations().Info(allocID, nil)
	if err != nil {
		return stdout, err
	}
	ctx, cancelFn := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancelFn()

	_, err = client.Allocations().Exec(ctx,
		alloc, "task", false,
		[]string{"cat", path},
		os.Stdin, &stdout, &stderr,
		make(chan api.TerminalSize), nil)
	return stdout, err
}
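
// waitForPluginStatusMinNodeCount polls "nomad plugin status" until the
// plugin reports at least minCount expected nodes and all expected nodes
// are healthy.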
func waitForPluginStatusMinNodeCount(pluginID string, minCount int, wc *e2e.WaitConfig) error {
	return waitForPluginStatusCompare(pluginID, func(out string) (bool, error) {
		expected, err := e2e.GetField(out, "Nodes Expected")
		if err != nil {
			return false, err
		}
		expectedCount, err := strconv.Atoi(strings.TrimSpace(expected))
		if err != nil {
			return false, err
		}
		if expectedCount < minCount {
			return false, fmt.Errorf(
				"expected Nodes Expected >= %d, got %q", minCount, expected)
		}
		healthy, err := e2e.GetField(out, "Nodes Healthy")
		if err != nil {
			return false, err
		}
		if healthy != expected {
			return false, fmt.Errorf(
				"expected Nodes Healthy = %q, got %q", expected, healthy)
		}
		return true, nil
	}, wc)
}
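
// waitForPluginStatusControllerCount polls "nomad plugin status" until the
// plugin reports exactly count expected and healthy controllers.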
func waitForPluginStatusControllerCount(pluginID string, count int, wc *e2e.WaitConfig) error {
	return waitForPluginStatusCompare(pluginID, func(out string) (bool, error) {
		expected, err := e2e.GetField(out, "Controllers Expected")
		if err != nil {
			return false, err
		}
		expectedCount, err := strconv.Atoi(strings.TrimSpace(expected))
		if err != nil {
			return false, err
		}
		if expectedCount != count {
			return false, fmt.Errorf(
				"expected Controllers Expected = %d, got %d", count, expectedCount)
		}
		healthy, err := e2e.GetField(out, "Controllers Healthy")
		if err != nil {
			return false, err
		}
		healthyCount, err := strconv.Atoi(strings.TrimSpace(healthy))
		if err != nil {
			return false, err
		}
		if healthyCount != count {
			return false, fmt.Errorf(
				"expected Controllers Healthy = %d, got %d", count, healthyCount)
		}
		return true, nil
	}, wc)
}
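
// waitForPluginStatusCompare polls "nomad plugin status" for pluginID until
// the compare function returns true or the retries are exhausted.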
func waitForPluginStatusCompare(pluginID string, compare func(got string) (bool, error), wc *e2e.WaitConfig) error {
	var err error
	testutil.WaitForResultRetries(wc.Retries, func() (bool, error) {
		time.Sleep(wc.Interval)
		out, err := e2e.Command("nomad", "plugin", "status", pluginID)
		if err != nil {
			return false, err
		}
		return compare(out)
	}, func(e error) {
		err = fmt.Errorf("plugin status check failed: %v", e)
	})
	return err
}

// volumeRegister registers a volume spec from a file, rewriting its ID to
// the given unique volID. The caller is responsible for recording that ID
// for later cleanup.
func volumeRegister(volID, volFilePath string) error {
	cmd := exec.Command("nomad", "volume", "register", "-")
	stdin, err := cmd.StdinPipe()
	if err != nil {
		return fmt.Errorf("could not open stdin?: %w", err)
	}

	content, err := ioutil.ReadFile(volFilePath)
	if err != nil {
		return fmt.Errorf("could not open vol file: %w", err)
	}

	// rewrite the spec's id with our unique ID; accept the field with or
	// without an "=" so either HCL form matches
	var re = regexp.MustCompile(`(?m)^id\s*=?\s*".*"`)
	volspec := re.ReplaceAllString(string(content),
		fmt.Sprintf("id = \"%s\"", volID))

	go func() {
		defer stdin.Close()
		io.WriteString(stdin, volspec)
	}()

	out, err := cmd.CombinedOutput()
	if err != nil {
		return fmt.Errorf("could not register vol: %w\n%v", err, string(out))
	}
	return nil
}