package command

import (
	"fmt"
	"os"
	"regexp"
	"testing"
	"time"

	"github.com/hashicorp/nomad/api"
	"github.com/hashicorp/nomad/command/agent"
	"github.com/hashicorp/nomad/helper/pointer"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/hashicorp/nomad/testutil"
	"github.com/shoenig/test/must"
)

var nonAlphaNum = regexp.MustCompile(`[^a-zA-Z0-9]+`)
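
// testServer starts an in-process test agent, optionally with the client
// enabled, and registers a cleanup to shut it down when the test finishes.
// It returns the agent, an API client pointed at it, and its HTTP address.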
func testServer(t *testing.T, runClient bool, cb func(*agent.Config)) (*agent.TestAgent, *api.Client, string) {
	// Make a new test server
	a := agent.NewTestAgent(t, t.Name(), func(config *agent.Config) {
		config.Client.Enabled = runClient
		if cb != nil {
			cb(config)
		}
	})
	t.Cleanup(a.Shutdown)

	c := a.Client()
	return a, c, a.HTTPAddr()
}

// testClient starts a new test client, blocks until it joins, and performs
// cleanup after the test is complete.
func testClient(t *testing.T, name string, cb func(*agent.Config)) (*agent.TestAgent, *api.Client, string) {
t.Logf("Starting client agent %s", name)
|
2021-10-12 20:58:41 +00:00
|
|
|
a := agent.NewTestAgent(t, name, func(config *agent.Config) {
|
|
|
|
if cb != nil {
|
|
|
|
cb(config)
|
|
|
|
}
|
|
|
|
})
|
2022-12-21 14:23:58 +00:00
|
|
|
t.Cleanup(a.Shutdown)
|
2021-10-12 20:58:41 +00:00
|
|
|
|
|
|
|
c := a.Client()
|
t.Logf("Waiting for client %s to join server(s) %s", name, a.GetConfig().Client.Servers)
|
2021-10-12 20:58:41 +00:00
|
|
|
testutil.WaitForClient(t, a.Agent.RPC, a.Agent.Client().NodeID(), a.Agent.Client().Region())
|
|
|
|
|
|
|
|
return a, c, a.HTTPAddr()
|
|
|
|
}
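
// testJob returns a minimal mock_driver batch job with a single
// "group1"/"task1" task group, suitable for registering against a test agent.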
func testJob(jobID string) *api.Job {
	task := api.NewTask("task1", "mock_driver").
		SetConfig("kill_after", "1s").
		SetConfig("run_for", "5s").
		SetConfig("exit_code", 0).
		Require(&api.Resources{
			MemoryMB: pointer.Of(256),
			CPU:      pointer.Of(100),
		}).
		SetLogConfig(&api.LogConfig{
			MaxFiles:      pointer.Of(1),
			MaxFileSizeMB: pointer.Of(2),
		})

	group := api.NewTaskGroup("group1", 1).
		AddTask(task).
		RequireDisk(&api.EphemeralDisk{
			SizeMB: pointer.Of(20),
		})

	job := api.NewBatchJob(jobID, jobID, "global", 1).
		AddDatacenter("dc1").
		AddTaskGroup(group)

	return job
}
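
// testNomadServiceJob wraps testJob and attaches a Nomad-provider service
// with a single HTTP check to the first task group.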
func testNomadServiceJob(jobID string) *api.Job {
	j := testJob(jobID)
	j.TaskGroups[0].Services = []*api.Service{{
		Name:        "service1",
		PortLabel:   "1000",
		AddressMode: "",
		Address:     "127.0.0.1",
		Checks: []api.ServiceCheck{{
			Name:     "check1",
			Type:     "http",
			Path:     "/",
			Interval: 1 * time.Second,
			Timeout:  1 * time.Second,
		}},
		Provider: "nomad",
	}}
	return j
}
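
// testMultiRegionJob returns a mock_driver service job in the given region
// and datacenter with a multiregion block spanning the "east" and "west"
// regions.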
func testMultiRegionJob(jobID, region, datacenter string) *api.Job {
	task := api.NewTask("task1", "mock_driver").
		SetConfig("kill_after", "10s").
		SetConfig("run_for", "15s").
		SetConfig("exit_code", 0).
		Require(&api.Resources{
			MemoryMB: pointer.Of(256),
			CPU:      pointer.Of(100),
		}).
		SetLogConfig(&api.LogConfig{
			MaxFiles:      pointer.Of(1),
			MaxFileSizeMB: pointer.Of(2),
		})

	group := api.NewTaskGroup("group1", 1).
		AddTask(task).
		RequireDisk(&api.EphemeralDisk{
			SizeMB: pointer.Of(20),
		})

	job := api.NewServiceJob(jobID, jobID, region, 1).AddDatacenter(datacenter).AddTaskGroup(group)
	job.Region = nil
	job.Multiregion = &api.Multiregion{
		Regions: []*api.MultiregionRegion{
			{
				Name:        "east",
				Datacenters: []string{"east-1"},
			},
			{
				Name:        "west",
				Datacenters: []string{"west-1"},
			},
		},
	}

	return job
}
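
// waitForNodes blocks until at least one node with the mock_driver is ready,
// failing the test if that does not happen within the testutil retry window.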
func waitForNodes(t *testing.T, client *api.Client) {
	testutil.WaitForResult(func() (bool, error) {
		nodes, _, err := client.Nodes().List(nil)
		if err != nil {
			return false, err
		}
		for _, node := range nodes {
			if _, ok := node.Drivers["mock_driver"]; ok &&
				node.Status == structs.NodeStatusReady {
				return true, nil
			}
		}
		return false, fmt.Errorf("no ready nodes")
	}, func(err error) {
		must.NoError(t, err)
	})
}
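
// waitForJobAllocsStatus blocks until every allocation of the given job
// reports the expected client status, querying with the supplied auth token.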
func waitForJobAllocsStatus(t *testing.T, client *api.Client, jobID string, status string, token string) {
	testutil.WaitForResult(func() (bool, error) {
		q := &api.QueryOptions{AuthToken: token}

		allocs, _, err := client.Jobs().Allocations(jobID, true, q)
		if err != nil {
			return false, fmt.Errorf("failed to query job allocs: %v", err)
		}
		if len(allocs) == 0 {
			return false, fmt.Errorf("no allocs")
		}

		for _, alloc := range allocs {
			if alloc.ClientStatus != status {
				return false, fmt.Errorf("alloc status is %q not %q", alloc.ClientStatus, status)
			}
		}
		return true, nil
	}, func(err error) {
		must.NoError(t, err)
	})
}
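
// waitForAllocStatus blocks until the given allocation reports the expected
// client status.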
func waitForAllocStatus(t *testing.T, client *api.Client, allocID string, status string) {
	testutil.WaitForResult(func() (bool, error) {
		alloc, _, err := client.Allocations().Info(allocID, nil)
		if err != nil {
			return false, err
		}
		if alloc.ClientStatus == status {
			return true, nil
		}
		return false, fmt.Errorf("alloc status is %q not %q", alloc.ClientStatus, status)
	}, func(err error) {
		must.NoError(t, err)
	})
}
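
// waitForAllocRunning blocks until the given allocation is running on its
// client.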
func waitForAllocRunning(t *testing.T, client *api.Client, allocID string) {
	waitForAllocStatus(t, client, allocID, api.AllocClientStatusRunning)
}
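
// waitForCheckStatus blocks until any check on the given allocation reports
// the expected status.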
func waitForCheckStatus(t *testing.T, client *api.Client, allocID, status string) {
	testutil.WaitForResult(func() (bool, error) {
		results, err := client.Allocations().Checks(allocID, nil)
		if err != nil {
			return false, err
		}

		// pick a check, any check will do
		for _, check := range results {
			if check.Status == status {
				return true, nil
			}
		}

		return false, fmt.Errorf("no check with status: %s", status)
	}, func(err error) {
		t.Fatalf("timed out waiting for check status %q: %v", status, err)
	})
}
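
// getAllocFromJob returns the ID of the first allocation found for the given
// job, failing the test if the job has no allocations yet.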
func getAllocFromJob(t *testing.T, client *api.Client, jobID string) string {
	var allocID string
	if allocations, _, err := client.Jobs().Allocations(jobID, false, nil); err == nil {
		if len(allocations) > 0 {
			allocID = allocations[0].ID
		}
	}
	must.NotEq(t, "", allocID, must.Sprint("expected to find an allocation after running job ", jobID))
	return allocID
}
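
// getTempFile creates an empty temporary file and returns its path along with
// a cleanup function that removes it.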
func getTempFile(t *testing.T, name string) (string, func()) {
	f, err := os.CreateTemp("", name)
	must.NoError(t, err)
	must.NoError(t, f.Close())
	return f.Name(), func() {
		_ = os.Remove(f.Name())
	}
}
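
// The sketch below is illustrative only and not part of the original helpers:
// it shows one way these helpers typically compose in a command test. The
// function name and the job ID prefix are hypothetical; it assumes a test
// agent with the client enabled and the standard api Jobs().Register call.
func exampleRunJobAndWaitSketch(t *testing.T) {
	// Start a combined server/client agent; testServer registers its own
	// shutdown via t.Cleanup.
	_, client, _ := testServer(t, true, nil)

	// Wait for a ready node with the mock_driver before scheduling anything.
	waitForNodes(t, client)

	// Register a minimal mock job, using nonAlphaNum to derive a safe job ID
	// from the test name.
	job := testJob("example-" + nonAlphaNum.ReplaceAllString(t.Name(), "-"))
	_, _, err := client.Jobs().Register(job, nil)
	must.NoError(t, err)

	// Block until every allocation for the job is running (empty auth token).
	waitForJobAllocsStatus(t, client, *job.ID, api.AllocClientStatusRunning, "")
}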