dc013b5267
The CSI plugin allocations take a while to be marked healthy, sometimes causing E2E test flakes during the setup phase of the tests. There's nothing CSI-specific about marking plugin allocs healthy, as the plugin supervisor hook does all the fingerprinting in the postrun hook (the prestart hook just makes a couple of empty directories). The timeouts we're seeing may be because of where we're pulling the images from: most of our jobs pull from a CDN-backed public registry, whereas these pull from ECR. Set a 1min timeout for these to make sure we have enough time to pull the image and start the task.
package csi

import (
	"bytes"
	"context"
	"fmt"
	"io"
	"io/ioutil"
	"os"
	"os/exec"
	"regexp"
	"strconv"
	"strings"
	"time"

	"github.com/hashicorp/nomad/api"
	e2e "github.com/hashicorp/nomad/e2e/e2eutil"
	"github.com/hashicorp/nomad/e2e/framework"
	"github.com/hashicorp/nomad/helper/uuid"
	"github.com/hashicorp/nomad/testutil"
)

func init() {
	framework.AddSuites(&framework.TestSuite{
		Component:   "CSI",
		CanRunLocal: true,
		Consul:      false,
		Cases: []framework.TestCase{
			new(CSIControllerPluginEBSTest), // see ebs.go
			new(CSINodeOnlyPluginEFSTest),   // see efs.go
		},
	})
}

// ns is the namespace we pass to the e2eutil helpers; the empty string
// means the default namespace.
const ns = ""

var pluginAllocWait = &e2e.WaitConfig{Interval: 5 * time.Second, Retries: 12} // 1min
var pluginWait = &e2e.WaitConfig{Interval: 5 * time.Second, Retries: 36}      // 3min
var reapWait = &e2e.WaitConfig{Interval: 5 * time.Second, Retries: 36}        // 3min

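// waitForPluginAllocsRunning is an illustrative sketch (not part of the
// original file) of how a test's setup phase might consume pluginAllocWait,
// giving a plugin task the full minute to pull its image from ECR and start.
// The "Status" and "ID" map keys are assumptions about the rows returned by
// e2e.AllocsForJob.
func waitForPluginAllocsRunning(pluginJobID string, wc *e2e.WaitConfig) error {
	var err error
	testutil.WaitForResultRetries(wc.Retries, func() (bool, error) {
		time.Sleep(wc.Interval)
		allocs, err := e2e.AllocsForJob(pluginJobID, ns)
		if err != nil {
			return false, err
		}
		if len(allocs) == 0 {
			return false, fmt.Errorf("no allocs placed for %q yet", pluginJobID)
		}
		for _, alloc := range allocs {
			if alloc["Status"] != "running" {
				return false, fmt.Errorf("alloc %s is %q, want running",
					alloc["ID"], alloc["Status"])
			}
		}
		return true, nil
	}, func(e error) {
		err = fmt.Errorf("plugin allocs did not start in time: %v", e)
	})
	return err
}
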
// assertNoErrorElseDump calls a non-halting assert on the error and dumps the
// plugin logs if it fails.
func assertNoErrorElseDump(f *framework.F, err error, msg string, pluginJobIDs []string) {
	if err != nil {
		dumpLogs(pluginJobIDs)
		f.Assert().NoError(err, fmt.Sprintf("%v: %v", msg, err))
	}
}

// requireNoErrorElseDump calls a halting assert on the error and dumps the
// plugin logs if it fails.
func requireNoErrorElseDump(f *framework.F, err error, msg string, pluginJobIDs []string) {
	if err != nil {
		dumpLogs(pluginJobIDs)
		f.NoError(err, fmt.Sprintf("%v: %v", msg, err))
	}
}

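// examplePluginCheck is an illustrative sketch (not part of the original
// file) showing how a test body uses the halting variant for setup steps
// that everything else depends on, passing along the plugin job IDs so a
// failure dumps the plugin logs. The pluginID, pluginJobID, and controller
// count values here are hypothetical.
func examplePluginCheck(f *framework.F, pluginID, pluginJobID string) {
	err := waitForPluginStatusControllerCount(pluginID, 2, pluginWait)
	requireNoErrorElseDump(f, err, "plugin controllers did not become healthy",
		[]string{pluginJobID})
}
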
// dumpLogs fetches the stderr logs for each plugin alloc and either prints
// them (in CI, where the filesystem is thrown away) or writes them to a
// file named for the alloc.
func dumpLogs(pluginIDs []string) error {

	for _, id := range pluginIDs {
		allocs, err := e2e.AllocsForJob(id, ns)
		if err != nil {
			return fmt.Errorf("could not find allocs for plugin: %v", err)
		}
		for _, alloc := range allocs {
			allocID := alloc["ID"]
			out, err := e2e.AllocLogs(allocID, e2e.LogsStdErr)
			if err != nil {
				return fmt.Errorf("could not get logs for alloc: %v\n%s", err, out)
			}
			_, isCI := os.LookupEnv("CI")
			if isCI {
				fmt.Println("--------------------------------------")
				fmt.Println("allocation logs:", allocID)
				fmt.Println(out)
				continue
			}
			f, err := os.Create(allocID + ".log")
			if err != nil {
				return fmt.Errorf("could not create log file: %v", err)
			}
			_, err = f.WriteString(out)
			// close inside the loop body rather than deferring, so we
			// don't accumulate open file handles across iterations
			f.Close()
			if err != nil {
				return fmt.Errorf("could not write to log file: %v", err)
			}
			fmt.Printf("nomad alloc logs written to %s.log\n", allocID)
		}
	}
	return nil
}

// waitForVolumeClaimRelease makes sure we don't try to re-claim a volume
// that's in the process of being unpublished. We can't just wait for allocs
// to stop; we need to wait for their claims to be released.
func waitForVolumeClaimRelease(volID string, wc *e2e.WaitConfig) error {
	var out string
	var err error
	testutil.WaitForResultRetries(wc.Retries, func() (bool, error) {
		time.Sleep(wc.Interval)
		out, err = e2e.Command("nomad", "volume", "status", volID)
		if err != nil {
			return false, err
		}
		section, err := e2e.GetSection(out, "Allocations")
		if err != nil {
			return false, err
		}
		return strings.Contains(section, "No allocations placed"), nil
	}, func(e error) {
		if e == nil {
			err = nil
			return // don't clobber a successful result below
		}
		err = fmt.Errorf("alloc claim was not released: %v\n%s", e, out)
	})
	return err
}

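// cleanupVolumeClaims is an illustrative sketch (not part of the original
// file): a test teardown stops the consuming job, then blocks on
// waitForVolumeClaimRelease so the deregister below doesn't race the
// unpublish workflow. The job and volume IDs are hypothetical.
func cleanupVolumeClaims(jobID, volID string) error {
	if _, err := e2e.Command("nomad", "job", "stop", "-purge", jobID); err != nil {
		return err
	}
	if err := waitForVolumeClaimRelease(volID, reapWait); err != nil {
		return err
	}
	_, err := e2e.Command("nomad", "volume", "deregister", volID)
	return err
}
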
// TODO(tgross): replace this w/ AllocFS().Stat() after
// https://github.com/hashicorp/nomad/issues/7365 is fixed
func readFile(client *api.Client, allocID string, path string) (bytes.Buffer, error) {
	var stdout, stderr bytes.Buffer
	alloc, _, err := client.Allocations().Info(allocID, nil)
	if err != nil {
		return stdout, err
	}
	ctx, cancelFn := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancelFn()

	_, err = client.Allocations().Exec(ctx,
		alloc, "task", false,
		[]string{"cat", path},
		os.Stdin, &stdout, &stderr,
		make(chan api.TerminalSize), nil)
	return stdout, err
}

func waitForPluginStatusMinNodeCount(pluginID string, minCount int, wc *e2e.WaitConfig) error {

	return waitForPluginStatusCompare(pluginID, func(out string) (bool, error) {
		expected, err := e2e.GetField(out, "Nodes Expected")
		if err != nil {
			return false, err
		}
		expectedCount, err := strconv.Atoi(strings.TrimSpace(expected))
		if err != nil {
			return false, err
		}
		if expectedCount < minCount {
			return false, fmt.Errorf(
				"expected Nodes Expected >= %d, got %q", minCount, expected)
		}
		healthy, err := e2e.GetField(out, "Nodes Healthy")
		if err != nil {
			return false, err
		}
		if healthy != expected {
			return false, fmt.Errorf(
				"expected Nodes Healthy = %q, got %q", expected, healthy)
		}
		return true, nil
	}, wc)
}

func waitForPluginStatusControllerCount(pluginID string, count int, wc *e2e.WaitConfig) error {

	return waitForPluginStatusCompare(pluginID, func(out string) (bool, error) {

		expected, err := e2e.GetField(out, "Controllers Expected")
		if err != nil {
			return false, err
		}
		expectedCount, err := strconv.Atoi(strings.TrimSpace(expected))
		if err != nil {
			return false, err
		}
		if expectedCount != count {
			return false, fmt.Errorf(
				"expected Controllers Expected = %d, got %d", count, expectedCount)
		}
		healthy, err := e2e.GetField(out, "Controllers Healthy")
		if err != nil {
			return false, err
		}
		healthyCount, err := strconv.Atoi(strings.TrimSpace(healthy))
		if err != nil {
			return false, err
		}
		if healthyCount != count {
			return false, fmt.Errorf(
				"expected Controllers Healthy = %d, got %d", count, healthyCount)
		}
		return true, nil
	}, wc)
}

func waitForPluginStatusCompare(pluginID string, compare func(got string) (bool, error), wc *e2e.WaitConfig) error {
	var err error
	testutil.WaitForResultRetries(wc.Retries, func() (bool, error) {
		time.Sleep(wc.Interval)
		out, err := e2e.Command("nomad", "plugin", "status", pluginID)
		if err != nil {
			return false, err
		}
		return compare(out)
	}, func(e error) {
		err = fmt.Errorf("plugin status check failed: %v", e)
	})
	return err
}

// volumeRegister creates or registers a volume spec from a file but with a
// unique ID. The caller is responsible for recording that ID for later
// cleanup.
func volumeRegister(volID, volFilePath, createOrRegister string) error {

	// a CSI RPC to create a volume can take a long time because we
	// have to wait on the AWS API to provision a disk, but a register
	// should not because it only has to check the API for compatibility
	timeout := time.Second * 30
	if createOrRegister == "create" {
		timeout = time.Minute * 2
	}
	ctx, cancel := context.WithTimeout(context.Background(), timeout)
	defer cancel()

	cmd := exec.CommandContext(ctx, "nomad", "volume", createOrRegister, "-")
	stdin, err := cmd.StdinPipe()
	if err != nil {
		return fmt.Errorf("could not open stdin: %w", err)
	}

	content, err := ioutil.ReadFile(volFilePath)
	if err != nil {
		return fmt.Errorf("could not open vol file: %w", err)
	}

	// rewrite the spec's id field with our unique ID
	var idRegex = regexp.MustCompile(`(?m)^id[\s]+= ".*"`)
	volspec := idRegex.ReplaceAllString(string(content),
		fmt.Sprintf("id = %q", volID))

	// the EBS plugin uses the name as an idempotency token across the
	// whole AWS account, so it has to be globally unique
	var nameRegex = regexp.MustCompile(`(?m)^name[\s]+= ".*"`)
	volspec = nameRegex.ReplaceAllString(volspec,
		fmt.Sprintf("name = %q", uuid.Generate()))

	go func() {
		defer stdin.Close()
		_, _ = io.WriteString(stdin, volspec)
	}()

	out, err := cmd.CombinedOutput()
	if err != nil {
		return fmt.Errorf("could not register vol: %w\n%v", err, string(out))
	}
	return nil
}
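
// exampleVolumeSetup is an illustrative sketch (not part of the original
// file) of calling volumeRegister in "create" mode, which provisions a new
// disk through the plugin and so gets the longer 2min timeout above. The
// spec path is hypothetical; the unique ID is what the caller must record
// for later cleanup.
func exampleVolumeSetup() (string, error) {
	volID := "csi-e2e-" + uuid.Generate()[:8]
	if err := volumeRegister(volID, "csi/input/ebs-volume.hcl", "create"); err != nil {
		return "", err
	}
	return volID, nil
}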