package command

import (
	"fmt"
	"io/ioutil"
	"net/http"
	"net/http/httptest"
	"os"
	"path/filepath"
	"regexp"
	"strings"
	"testing"
	"time"

	consulapi "github.com/hashicorp/consul/api"
	consultest "github.com/hashicorp/consul/sdk/testutil"
	"github.com/hashicorp/nomad/api"
	"github.com/hashicorp/nomad/ci"
	clienttest "github.com/hashicorp/nomad/client/testutil"
	"github.com/hashicorp/nomad/command/agent"
	"github.com/hashicorp/nomad/helper"
	"github.com/hashicorp/nomad/helper/pointer"
	"github.com/hashicorp/nomad/nomad/state"
	"github.com/hashicorp/nomad/testutil"
	"github.com/mitchellh/cli"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
)

// NOTE: most of these tests cannot be run in parallel

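// testCase describes a single `operator debug` invocation: the CLI arguments to
// run and the exit code, stdout fragments, and stderr fragment expected from it.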
type testCase struct {
	name            string
	args            []string
	expectedCode    int
	expectedOutputs []string
	expectedError   string
}

type testCases []testCase

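// runTestCases runs each case through a fresh OperatorDebugCommand backed by a
// mock UI, then asserts on the exit code, the expected stdout fragments, and the
// expected (or absent) error output.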
func runTestCases(t *testing.T, cases testCases) {
	t.Helper()
	for _, c := range cases {
		t.Run(c.name, func(t *testing.T) {
			ui := cli.NewMockUi()
			cmd := &OperatorDebugCommand{Meta: Meta{Ui: ui}}

			code := cmd.Run(c.args)
			out := ui.OutputWriter.String()
			outerr := ui.ErrorWriter.String()

			assert.Equalf(t, c.expectedCode, code, "did not get expected exit code")

			if len(c.expectedOutputs) > 0 {
				if assert.NotEmpty(t, out, "command output was empty") {
					for _, expectedOutput := range c.expectedOutputs {
						assert.Contains(t, out, expectedOutput, "did not get expected output")
					}
				}
			} else {
				assert.Empty(t, out, "command output should have been empty")
			}

			if c.expectedError == "" {
				assert.Empty(t, outerr, "got unexpected error")
			} else {
				assert.Containsf(t, outerr, c.expectedError, "did not get expected error")
			}
		})
	}
}
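
// newClientAgentConfigFunc returns an agent config callback that sets up a
// client-only agent joined to the given server RPC address, using the provided
// region (defaulting to "global") and node class.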
func newClientAgentConfigFunc(region string, nodeClass string, srvRPCAddr string) func(*agent.Config) {
	if region == "" {
		region = "global"
	}

	return func(c *agent.Config) {
		c.Region = region
		c.Client.NodeClass = nodeClass
		c.Client.Servers = []string{srvRPCAddr}
		c.Client.Enabled = true
		c.Server.Enabled = false
	}
}

func TestDebug_NodeClass(t *testing.T) {
	// Start test server and API client
	srv, _, url := testServer(t, false, nil)

	// Wait for leadership to establish
	testutil.WaitForLeader(t, srv.Agent.RPC)

	// Retrieve server RPC address to join clients
	srvRPCAddr := srv.GetConfig().AdvertiseAddrs.RPC
	t.Logf("Leader started, srv.GetConfig().AdvertiseAddrs.RPC: %s", srvRPCAddr)

	// Start test clients
	testClient(t, "client1", newClientAgentConfigFunc("global", "classA", srvRPCAddr))
	testClient(t, "client2", newClientAgentConfigFunc("global", "classB", srvRPCAddr))
	testClient(t, "client3", newClientAgentConfigFunc("global", "classA", srvRPCAddr))

	// Setup test cases
	cases := testCases{
		{
			name:         "address=api, node-class=classA, max-nodes=2",
			args:         []string{"-address", url, "-duration", "250ms", "-interval", "250ms", "-server-id", "all", "-node-id", "all", "-node-class", "classA", "-max-nodes", "2"},
			expectedCode: 0,
			expectedOutputs: []string{
				"Servers: (1/1)",
				"Clients: (2/3)",
				"Max node count reached (2)",
				"Node Class: classA",
				"Created debug archive",
			},
			expectedError: "",
		},
		{
			name:         "address=api, node-class=classB, max-nodes=2",
			args:         []string{"-address", url, "-duration", "250ms", "-interval", "250ms", "-server-id", "all", "-node-id", "all", "-node-class", "classB", "-max-nodes", "2"},
			expectedCode: 0,
			expectedOutputs: []string{
				"Servers: (1/1)",
				"Clients: (1/3)",
				"Node Class: classB",
				"Created debug archive",
			},
			expectedError: "",
		},
	}

	runTestCases(t, cases)
}

func TestDebug_ClientToServer(t *testing.T) {
	// Start test server and API client
	srv, _, url := testServer(t, false, nil)

	// Wait for leadership to establish
	testutil.WaitForLeader(t, srv.Agent.RPC)

	// Retrieve server RPC address to join client
	srvRPCAddr := srv.GetConfig().AdvertiseAddrs.RPC
	t.Logf("Leader started, srv.GetConfig().AdvertiseAddrs.RPC: %s", srvRPCAddr)

	// Start client
	agent1, _, _ := testClient(t, "client1", newClientAgentConfigFunc("", "", srvRPCAddr))

	// Get API addresses
	addrServer := srv.HTTPAddr()
	addrClient1 := agent1.HTTPAddr()

	t.Logf("testAgent api address: %s", url)
	t.Logf("Server api address: %s", addrServer)
	t.Logf("Client1 api address: %s", addrClient1)

	// Setup test cases
	var cases = testCases{
		{
			name:            "testAgent api server",
			args:            []string{"-address", url, "-duration", "250ms", "-interval", "250ms", "-server-id", "all", "-node-id", "all"},
			expectedCode:    0,
			expectedOutputs: []string{"Created debug archive"},
		},
		{
			name:            "server address",
			args:            []string{"-address", addrServer, "-duration", "250ms", "-interval", "250ms", "-server-id", "all", "-node-id", "all"},
			expectedCode:    0,
			expectedOutputs: []string{"Created debug archive"},
		},
		{
			name:            "client1 address - verify no SIGSEGV panic",
			args:            []string{"-address", addrClient1, "-duration", "250ms", "-interval", "250ms", "-server-id", "all", "-node-id", "all"},
			expectedCode:    0,
			expectedOutputs: []string{"Created debug archive"},
		},
	}

	runTestCases(t, cases)
}
|
2020-12-14 20:02:48 +00:00
|
|
|
|
2021-11-05 23:43:10 +00:00
|
|
|
func TestDebug_MultiRegion(t *testing.T) {
|
2022-03-15 12:42:43 +00:00
|
|
|
|
2021-11-05 23:43:10 +00:00
|
|
|
region1 := "region1"
|
|
|
|
region2 := "region2"
|
|
|
|
|
|
|
|
// Start region1 server
|
|
|
|
server1, _, addrServer1 := testServer(t, false, func(c *agent.Config) { c.Region = region1 })
|
|
|
|
testutil.WaitForLeader(t, server1.Agent.RPC)
|
|
|
|
rpcAddrServer1 := server1.GetConfig().AdvertiseAddrs.RPC
|
Fix flaky `operator debug` test (#12501)
We introduced a `pprof-interval` argument to `operator debug` in #11938, and unfortunately this has resulted in a lot of test flakes. The actual command in use is mostly fine (although I've fixed some quirks here), so what's really happened is that the change has revealed some existing issues in the tests. Summary of changes:
* Make first pprof collection synchronous to preserve the existing
behavior for the common case where the pprof interval matches the
duration.
* Clamp `operator debug` pprof timing to that of the command. The
`pprof-duration` should be no more than `duration` and the
`pprof-interval` should be no more than `pprof-duration`. Clamp the
values rather than throwing errors, which could change the commands
that existing users might already have in debugging scripts
* Testing: remove test parallelism
The `operator debug` tests that stand up servers can't be run in
parallel, because we don't have a way of canceling the API calls for
pprof. The agent will still be running the last pprof when we exit,
and that breaks the next test that talks to that same agent.
(Because you can only run one pprof at a time on any process!)
We could split off each subtest into its own server, but this test
suite is already very slow. In future work we should fix this "for
real" by making the API call cancelable.
* Testing: assert against unexpected errors in `operator debug` tests.
If we assert there are no unexpected error outputs, it's easier for
the developer to debug when something is going wrong with the tests
because the error output will be presented as a failing test, rather
than just a failing exit code check. Or worse, no failing exit code
check!
This also forces us to be explicit about which tests will return 0
exit codes but still emit (presumably ignorable) error outputs.
Additional minor bug fixes (mostly in tests) and test refactorings:
* Fix text alignment on pprof Duration in `operator debug` output
* Remove "done" channel from `operator debug` event stream test. The
goroutine we're blocking for here already tells us it's done by
sending a value, so block on that instead of an extraneous channel
* Event stream test timer should start at current time, not zero
* Remove noise from `operator debug` test log output. The `t.Logf`
calls already are picked out from the rest of the test output by
being prefixed with the filename.
* Remove explicit pprof args so we use the defaults clamped from
duration/interval
2022-04-07 19:00:07 +00:00
|
|
|
t.Logf("%s: Leader started, HTTPAddr: %s, RPC: %s", region1, addrServer1, rpcAddrServer1)
|
2021-11-05 23:43:10 +00:00
|
|
|
|
|
|
|
// Start region1 client
|
|
|
|
agent1, _, addrClient1 := testClient(t, "client1", newClientAgentConfigFunc(region1, "", rpcAddrServer1))
|
|
|
|
nodeIdClient1 := agent1.Agent.Client().NodeID()
|
Fix flaky `operator debug` test (#12501)
We introduced a `pprof-interval` argument to `operator debug` in #11938, and unfortunately this has resulted in a lot of test flakes. The actual command in use is mostly fine (although I've fixed some quirks here), so what's really happened is that the change has revealed some existing issues in the tests. Summary of changes:
* Make first pprof collection synchronous to preserve the existing
behavior for the common case where the pprof interval matches the
duration.
* Clamp `operator debug` pprof timing to that of the command. The
`pprof-duration` should be no more than `duration` and the
`pprof-interval` should be no more than `pprof-duration`. Clamp the
values rather than throwing errors, which could change the commands
that existing users might already have in debugging scripts
* Testing: remove test parallelism
The `operator debug` tests that stand up servers can't be run in
parallel, because we don't have a way of canceling the API calls for
pprof. The agent will still be running the last pprof when we exit,
and that breaks the next test that talks to that same agent.
(Because you can only run one pprof at a time on any process!)
We could split off each subtest into its own server, but this test
suite is already very slow. In future work we should fix this "for
real" by making the API call cancelable.
* Testing: assert against unexpected errors in `operator debug` tests.
If we assert there are no unexpected error outputs, it's easier for
the developer to debug when something is going wrong with the tests
because the error output will be presented as a failing test, rather
than just a failing exit code check. Or worse, no failing exit code
check!
This also forces us to be explicit about which tests will return 0
exit codes but still emit (presumably ignorable) error outputs.
Additional minor bug fixes (mostly in tests) and test refactorings:
* Fix text alignment on pprof Duration in `operator debug` output
* Remove "done" channel from `operator debug` event stream test. The
goroutine we're blocking for here already tells us it's done by
sending a value, so block on that instead of an extraneous channel
* Event stream test timer should start at current time, not zero
* Remove noise from `operator debug` test log output. The `t.Logf`
calls already are picked out from the rest of the test output by
being prefixed with the filename.
* Remove explicit pprof args so we use the defaults clamped from
duration/interval
2022-04-07 19:00:07 +00:00
|
|
|
t.Logf("%s: Client1 started, ID: %s, HTTPAddr: %s", region1, nodeIdClient1, addrClient1)
|
2021-11-05 23:43:10 +00:00
|
|
|
|
|
|
|
// Start region2 server
|
|
|
|
server2, _, addrServer2 := testServer(t, false, func(c *agent.Config) { c.Region = region2 })
|
|
|
|
testutil.WaitForLeader(t, server2.Agent.RPC)
|
|
|
|
rpcAddrServer2 := server2.GetConfig().AdvertiseAddrs.RPC
|
Fix flaky `operator debug` test (#12501)
We introduced a `pprof-interval` argument to `operator debug` in #11938, and unfortunately this has resulted in a lot of test flakes. The actual command in use is mostly fine (although I've fixed some quirks here), so what's really happened is that the change has revealed some existing issues in the tests. Summary of changes:
* Make first pprof collection synchronous to preserve the existing
behavior for the common case where the pprof interval matches the
duration.
* Clamp `operator debug` pprof timing to that of the command. The
`pprof-duration` should be no more than `duration` and the
`pprof-interval` should be no more than `pprof-duration`. Clamp the
values rather than throwing errors, which could change the commands
that existing users might already have in debugging scripts
* Testing: remove test parallelism
The `operator debug` tests that stand up servers can't be run in
parallel, because we don't have a way of canceling the API calls for
pprof. The agent will still be running the last pprof when we exit,
and that breaks the next test that talks to that same agent.
(Because you can only run one pprof at a time on any process!)
We could split off each subtest into its own server, but this test
suite is already very slow. In future work we should fix this "for
real" by making the API call cancelable.
* Testing: assert against unexpected errors in `operator debug` tests.
If we assert there are no unexpected error outputs, it's easier for
the developer to debug when something is going wrong with the tests
because the error output will be presented as a failing test, rather
than just a failing exit code check. Or worse, no failing exit code
check!
This also forces us to be explicit about which tests will return 0
exit codes but still emit (presumably ignorable) error outputs.
Additional minor bug fixes (mostly in tests) and test refactorings:
* Fix text alignment on pprof Duration in `operator debug` output
* Remove "done" channel from `operator debug` event stream test. The
goroutine we're blocking for here already tells us it's done by
sending a value, so block on that instead of an extraneous channel
* Event stream test timer should start at current time, not zero
* Remove noise from `operator debug` test log output. The `t.Logf`
calls already are picked out from the rest of the test output by
being prefixed with the filename.
* Remove explicit pprof args so we use the defaults clamped from
duration/interval
2022-04-07 19:00:07 +00:00
|
|
|
t.Logf("%s: Leader started, HTTPAddr: %s, RPC: %s", region2, addrServer2, rpcAddrServer2)
|
2021-11-05 23:43:10 +00:00
|
|
|
|
|
|
|
// Start client2
|
|
|
|
agent2, _, addrClient2 := testClient(t, "client2", newClientAgentConfigFunc(region2, "", rpcAddrServer2))
|
|
|
|
nodeIdClient2 := agent2.Agent.Client().NodeID()
|
Fix flaky `operator debug` test (#12501)
We introduced a `pprof-interval` argument to `operator debug` in #11938, and unfortunately this has resulted in a lot of test flakes. The actual command in use is mostly fine (although I've fixed some quirks here), so what's really happened is that the change has revealed some existing issues in the tests. Summary of changes:
* Make first pprof collection synchronous to preserve the existing
behavior for the common case where the pprof interval matches the
duration.
* Clamp `operator debug` pprof timing to that of the command. The
`pprof-duration` should be no more than `duration` and the
`pprof-interval` should be no more than `pprof-duration`. Clamp the
values rather than throwing errors, which could change the commands
that existing users might already have in debugging scripts
* Testing: remove test parallelism
The `operator debug` tests that stand up servers can't be run in
parallel, because we don't have a way of canceling the API calls for
pprof. The agent will still be running the last pprof when we exit,
and that breaks the next test that talks to that same agent.
(Because you can only run one pprof at a time on any process!)
We could split off each subtest into its own server, but this test
suite is already very slow. In future work we should fix this "for
real" by making the API call cancelable.
* Testing: assert against unexpected errors in `operator debug` tests.
If we assert there are no unexpected error outputs, it's easier for
the developer to debug when something is going wrong with the tests
because the error output will be presented as a failing test, rather
than just a failing exit code check. Or worse, no failing exit code
check!
This also forces us to be explicit about which tests will return 0
exit codes but still emit (presumably ignorable) error outputs.
Additional minor bug fixes (mostly in tests) and test refactorings:
* Fix text alignment on pprof Duration in `operator debug` output
* Remove "done" channel from `operator debug` event stream test. The
goroutine we're blocking for here already tells us it's done by
sending a value, so block on that instead of an extraneous channel
* Event stream test timer should start at current time, not zero
* Remove noise from `operator debug` test log output. The `t.Logf`
calls already are picked out from the rest of the test output by
being prefixed with the filename.
* Remove explicit pprof args so we use the defaults clamped from
duration/interval
2022-04-07 19:00:07 +00:00
|
|
|
t.Logf("%s: Client1 started, ID: %s, HTTPAddr: %s", region2, nodeIdClient2, addrClient2)
|
2021-11-05 23:43:10 +00:00
|
|
|
|
Fix flaky `operator debug` test (#12501)
We introduced a `pprof-interval` argument to `operator debug` in #11938, and unfortunately this has resulted in a lot of test flakes. The actual command in use is mostly fine (although I've fixed some quirks here), so what's really happened is that the change has revealed some existing issues in the tests. Summary of changes:
* Make first pprof collection synchronous to preserve the existing
behavior for the common case where the pprof interval matches the
duration.
* Clamp `operator debug` pprof timing to that of the command. The
`pprof-duration` should be no more than `duration` and the
`pprof-interval` should be no more than `pprof-duration`. Clamp the
values rather than throwing errors, which could change the commands
that existing users might already have in debugging scripts
* Testing: remove test parallelism
The `operator debug` tests that stand up servers can't be run in
parallel, because we don't have a way of canceling the API calls for
pprof. The agent will still be running the last pprof when we exit,
and that breaks the next test that talks to that same agent.
(Because you can only run one pprof at a time on any process!)
We could split off each subtest into its own server, but this test
suite is already very slow. In future work we should fix this "for
real" by making the API call cancelable.
* Testing: assert against unexpected errors in `operator debug` tests.
If we assert there are no unexpected error outputs, it's easier for
the developer to debug when something is going wrong with the tests
because the error output will be presented as a failing test, rather
than just a failing exit code check. Or worse, no failing exit code
check!
This also forces us to be explicit about which tests will return 0
exit codes but still emit (presumably ignorable) error outputs.
Additional minor bug fixes (mostly in tests) and test refactorings:
* Fix text alignment on pprof Duration in `operator debug` output
* Remove "done" channel from `operator debug` event stream test. The
goroutine we're blocking for here already tells us it's done by
sending a value, so block on that instead of an extraneous channel
* Event stream test timer should start at current time, not zero
* Remove noise from `operator debug` test log output. The `t.Logf`
calls already are picked out from the rest of the test output by
being prefixed with the filename.
* Remove explicit pprof args so we use the defaults clamped from
duration/interval
2022-04-07 19:00:07 +00:00
|
|
|
t.Logf("Region: %s, Server1 api address: %s", region1, addrServer1)
|
|
|
|
t.Logf("Region: %s, Client1 api address: %s", region1, addrClient1)
|
|
|
|
t.Logf("Region: %s, Server2 api address: %s", region2, addrServer2)
|
|
|
|
t.Logf("Region: %s, Client2 api address: %s", region2, addrClient2)
|
2021-10-12 20:58:41 +00:00
|
|
|
|
|
|
|
// Setup test cases
|
|
|
|
var cases = testCases{
|
|
|
|
// Good
|
|
|
|
{
|
Fix flaky `operator debug` test (#12501)
We introduced a `pprof-interval` argument to `operator debug` in #11938, and unfortunately this has resulted in a lot of test flakes. The actual command in use is mostly fine (although I've fixed some quirks here), so what's really happened is that the change has revealed some existing issues in the tests. Summary of changes:
* Make first pprof collection synchronous to preserve the existing
behavior for the common case where the pprof interval matches the
duration.
* Clamp `operator debug` pprof timing to that of the command. The
`pprof-duration` should be no more than `duration` and the
`pprof-interval` should be no more than `pprof-duration`. Clamp the
values rather than throwing errors, which could change the commands
that existing users might already have in debugging scripts
* Testing: remove test parallelism
The `operator debug` tests that stand up servers can't be run in
parallel, because we don't have a way of canceling the API calls for
pprof. The agent will still be running the last pprof when we exit,
and that breaks the next test that talks to that same agent.
(Because you can only run one pprof at a time on any process!)
We could split off each subtest into its own server, but this test
suite is already very slow. In future work we should fix this "for
real" by making the API call cancelable.
* Testing: assert against unexpected errors in `operator debug` tests.
If we assert there are no unexpected error outputs, it's easier for
the developer to debug when something is going wrong with the tests
because the error output will be presented as a failing test, rather
than just a failing exit code check. Or worse, no failing exit code
check!
This also forces us to be explicit about which tests will return 0
exit codes but still emit (presumably ignorable) error outputs.
Additional minor bug fixes (mostly in tests) and test refactorings:
* Fix text alignment on pprof Duration in `operator debug` output
* Remove "done" channel from `operator debug` event stream test. The
goroutine we're blocking for here already tells us it's done by
sending a value, so block on that instead of an extraneous channel
* Event stream test timer should start at current time, not zero
* Remove noise from `operator debug` test log output. The `t.Logf`
calls already are picked out from the rest of the test output by
being prefixed with the filename.
* Remove explicit pprof args so we use the defaults clamped from
duration/interval
2022-04-07 19:00:07 +00:00
|
|
|
name: "no region - all servers, all clients",
|
|
|
|
args: []string{"-address", addrServer1, "-duration", "250ms", "-interval", "250ms", "-server-id", "all", "-node-id", "all"},
|
|
|
|
expectedCode: 0,
|
|
|
|
expectedOutputs: []string{"Starting debugger"},
|
2021-11-05 23:43:10 +00:00
|
|
|
},
|
|
|
|
{
|
|
|
|
name: "region1 - server1 address",
|
Fix flaky `operator debug` test (#12501)
We introduced a `pprof-interval` argument to `operator debug` in #11938, and unfortunately this has resulted in a lot of test flakes. The actual command in use is mostly fine (although I've fixed some quirks here), so what's really happened is that the change has revealed some existing issues in the tests. Summary of changes:
* Make first pprof collection synchronous to preserve the existing
behavior for the common case where the pprof interval matches the
duration.
* Clamp `operator debug` pprof timing to that of the command. The
`pprof-duration` should be no more than `duration` and the
`pprof-interval` should be no more than `pprof-duration`. Clamp the
values rather than throwing errors, which could change the commands
that existing users might already have in debugging scripts
* Testing: remove test parallelism
The `operator debug` tests that stand up servers can't be run in
parallel, because we don't have a way of canceling the API calls for
pprof. The agent will still be running the last pprof when we exit,
and that breaks the next test that talks to that same agent.
(Because you can only run one pprof at a time on any process!)
We could split off each subtest into its own server, but this test
suite is already very slow. In future work we should fix this "for
real" by making the API call cancelable.
* Testing: assert against unexpected errors in `operator debug` tests.
If we assert there are no unexpected error outputs, it's easier for
the developer to debug when something is going wrong with the tests
because the error output will be presented as a failing test, rather
than just a failing exit code check. Or worse, no failing exit code
check!
This also forces us to be explicit about which tests will return 0
exit codes but still emit (presumably ignorable) error outputs.
Additional minor bug fixes (mostly in tests) and test refactorings:
* Fix text alignment on pprof Duration in `operator debug` output
* Remove "done" channel from `operator debug` event stream test. The
goroutine we're blocking for here already tells us it's done by
sending a value, so block on that instead of an extraneous channel
* Event stream test timer should start at current time, not zero
* Remove noise from `operator debug` test log output. The `t.Logf`
calls already are picked out from the rest of the test output by
being prefixed with the filename.
* Remove explicit pprof args so we use the defaults clamped from
duration/interval
2022-04-07 19:00:07 +00:00
|
|
|
args: []string{"-address", addrServer1, "-region", region1, "-duration", "50ms", "-interval", "50ms", "-server-id", "all", "-node-id", "all"},
|
2021-10-12 20:58:41 +00:00
|
|
|
expectedCode: 0,
|
|
|
|
expectedOutputs: []string{
|
2021-11-05 23:43:10 +00:00
|
|
|
"Region: " + region1 + "\n",
|
|
|
|
"Servers: (1/1) [TestDebug_MultiRegion.region1]",
|
|
|
|
"Clients: (1/1) [" + nodeIdClient1 + "]",
|
2021-10-12 20:58:41 +00:00
|
|
|
"Created debug archive",
|
|
|
|
},
|
|
|
|
},
|
|
|
|
{
|
2021-11-05 23:43:10 +00:00
|
|
|
name: "region1 - client1 address",
|
Fix flaky `operator debug` test (#12501)
We introduced a `pprof-interval` argument to `operator debug` in #11938, and unfortunately this has resulted in a lot of test flakes. The actual command in use is mostly fine (although I've fixed some quirks here), so what's really happened is that the change has revealed some existing issues in the tests. Summary of changes:
* Make first pprof collection synchronous to preserve the existing
behavior for the common case where the pprof interval matches the
duration.
* Clamp `operator debug` pprof timing to that of the command. The
`pprof-duration` should be no more than `duration` and the
`pprof-interval` should be no more than `pprof-duration`. Clamp the
values rather than throwing errors, which could change the commands
that existing users might already have in debugging scripts
* Testing: remove test parallelism
The `operator debug` tests that stand up servers can't be run in
parallel, because we don't have a way of canceling the API calls for
pprof. The agent will still be running the last pprof when we exit,
and that breaks the next test that talks to that same agent.
(Because you can only run one pprof at a time on any process!)
We could split off each subtest into its own server, but this test
suite is already very slow. In future work we should fix this "for
real" by making the API call cancelable.
* Testing: assert against unexpected errors in `operator debug` tests.
If we assert there are no unexpected error outputs, it's easier for
the developer to debug when something is going wrong with the tests
because the error output will be presented as a failing test, rather
than just a failing exit code check. Or worse, no failing exit code
check!
This also forces us to be explicit about which tests will return 0
exit codes but still emit (presumably ignorable) error outputs.
Additional minor bug fixes (mostly in tests) and test refactorings:
* Fix text alignment on pprof Duration in `operator debug` output
* Remove "done" channel from `operator debug` event stream test. The
goroutine we're blocking for here already tells us it's done by
sending a value, so block on that instead of an extraneous channel
* Event stream test timer should start at current time, not zero
* Remove noise from `operator debug` test log output. The `t.Logf`
calls already are picked out from the rest of the test output by
being prefixed with the filename.
* Remove explicit pprof args so we use the defaults clamped from
duration/interval
2022-04-07 19:00:07 +00:00
|
|
|
args: []string{"-address", addrClient1, "-region", region1, "-duration", "50ms", "-interval", "50ms", "-server-id", "all", "-node-id", "all"},
|
2021-10-12 20:58:41 +00:00
|
|
|
expectedCode: 0,
|
|
|
|
expectedOutputs: []string{
|
2021-11-05 23:43:10 +00:00
|
|
|
"Region: " + region1 + "\n",
|
|
|
|
"Servers: (1/1) [TestDebug_MultiRegion.region1]",
|
|
|
|
"Clients: (1/1) [" + nodeIdClient1 + "]",
|
2021-10-12 20:58:41 +00:00
|
|
|
"Created debug archive",
|
|
|
|
},
|
|
|
|
},
|
|
|
|
{
|
2021-11-05 23:43:10 +00:00
|
|
|
name: "region2 - server2 address",
|
Fix flaky `operator debug` test (#12501)
We introduced a `pprof-interval` argument to `operator debug` in #11938, and unfortunately this has resulted in a lot of test flakes. The actual command in use is mostly fine (although I've fixed some quirks here), so what's really happened is that the change has revealed some existing issues in the tests. Summary of changes:
* Make first pprof collection synchronous to preserve the existing
behavior for the common case where the pprof interval matches the
duration.
* Clamp `operator debug` pprof timing to that of the command. The
`pprof-duration` should be no more than `duration` and the
`pprof-interval` should be no more than `pprof-duration`. Clamp the
values rather than throwing errors, which could change the commands
that existing users might already have in debugging scripts
* Testing: remove test parallelism
The `operator debug` tests that stand up servers can't be run in
parallel, because we don't have a way of canceling the API calls for
pprof. The agent will still be running the last pprof when we exit,
and that breaks the next test that talks to that same agent.
(Because you can only run one pprof at a time on any process!)
We could split off each subtest into its own server, but this test
suite is already very slow. In future work we should fix this "for
real" by making the API call cancelable.
* Testing: assert against unexpected errors in `operator debug` tests.
If we assert there are no unexpected error outputs, it's easier for
the developer to debug when something is going wrong with the tests
because the error output will be presented as a failing test, rather
than just a failing exit code check. Or worse, no failing exit code
check!
This also forces us to be explicit about which tests will return 0
exit codes but still emit (presumably ignorable) error outputs.
Additional minor bug fixes (mostly in tests) and test refactorings:
* Fix text alignment on pprof Duration in `operator debug` output
* Remove "done" channel from `operator debug` event stream test. The
goroutine we're blocking for here already tells us it's done by
sending a value, so block on that instead of an extraneous channel
* Event stream test timer should start at current time, not zero
* Remove noise from `operator debug` test log output. The `t.Logf`
calls already are picked out from the rest of the test output by
being prefixed with the filename.
* Remove explicit pprof args so we use the defaults clamped from
duration/interval
2022-04-07 19:00:07 +00:00
|
|
|
args: []string{"-address", addrServer2, "-region", region2, "-duration", "50ms", "-interval", "50ms", "-server-id", "all", "-node-id", "all"},
|
2021-10-12 20:58:41 +00:00
|
|
|
expectedCode: 0,
|
|
|
|
expectedOutputs: []string{
|
2021-11-05 23:43:10 +00:00
|
|
|
"Region: " + region2 + "\n",
|
|
|
|
"Servers: (1/1) [TestDebug_MultiRegion.region2]",
|
|
|
|
"Clients: (1/1) [" + nodeIdClient2 + "]",
|
|
|
|
"Created debug archive",
|
|
|
|
},
|
|
|
|
},
|
|
|
|
{
|
|
|
|
name: "region2 - client2 address",
|
Fix flaky `operator debug` test (#12501)
We introduced a `pprof-interval` argument to `operator debug` in #11938, and unfortunately this has resulted in a lot of test flakes. The actual command in use is mostly fine (although I've fixed some quirks here), so what's really happened is that the change has revealed some existing issues in the tests. Summary of changes:
* Make first pprof collection synchronous to preserve the existing
behavior for the common case where the pprof interval matches the
duration.
* Clamp `operator debug` pprof timing to that of the command. The
`pprof-duration` should be no more than `duration` and the
`pprof-interval` should be no more than `pprof-duration`. Clamp the
values rather than throwing errors, which could change the commands
that existing users might already have in debugging scripts
* Testing: remove test parallelism
The `operator debug` tests that stand up servers can't be run in
parallel, because we don't have a way of canceling the API calls for
pprof. The agent will still be running the last pprof when we exit,
and that breaks the next test that talks to that same agent.
(Because you can only run one pprof at a time on any process!)
We could split off each subtest into its own server, but this test
suite is already very slow. In future work we should fix this "for
real" by making the API call cancelable.
* Testing: assert against unexpected errors in `operator debug` tests.
If we assert there are no unexpected error outputs, it's easier for
the developer to debug when something is going wrong with the tests
because the error output will be presented as a failing test, rather
than just a failing exit code check. Or worse, no failing exit code
check!
This also forces us to be explicit about which tests will return 0
exit codes but still emit (presumably ignorable) error outputs.
Additional minor bug fixes (mostly in tests) and test refactorings:
* Fix text alignment on pprof Duration in `operator debug` output
* Remove "done" channel from `operator debug` event stream test. The
goroutine we're blocking for here already tells us it's done by
sending a value, so block on that instead of an extraneous channel
* Event stream test timer should start at current time, not zero
* Remove noise from `operator debug` test log output. The `t.Logf`
calls already are picked out from the rest of the test output by
being prefixed with the filename.
* Remove explicit pprof args so we use the defaults clamped from
duration/interval
2022-04-07 19:00:07 +00:00
|
|
|
args: []string{"-address", addrClient2, "-region", region2, "-duration", "50ms", "-interval", "50ms", "-server-id", "all", "-node-id", "all"},
|
2021-11-05 23:43:10 +00:00
|
|
|
expectedCode: 0,
|
|
|
|
expectedOutputs: []string{
|
|
|
|
"Region: " + region2 + "\n",
|
|
|
|
"Servers: (1/1) [TestDebug_MultiRegion.region2]",
|
|
|
|
"Clients: (1/1) [" + nodeIdClient2 + "]",
|
2021-10-12 20:58:41 +00:00
|
|
|
"Created debug archive",
|
|
|
|
},
|
|
|
|
},
|
|
|
|
|
|
|
|
// Bad
|
|
|
|
{
|
|
|
|
name: "invalid region - all servers, all clients",
|
Fix flaky `operator debug` test (#12501)
We introduced a `pprof-interval` argument to `operator debug` in #11938, and unfortunately this has resulted in a lot of test flakes. The actual command in use is mostly fine (although I've fixed some quirks here), so what's really happened is that the change has revealed some existing issues in the tests. Summary of changes:
* Make first pprof collection synchronous to preserve the existing
behavior for the common case where the pprof interval matches the
duration.
* Clamp `operator debug` pprof timing to that of the command. The
`pprof-duration` should be no more than `duration` and the
`pprof-interval` should be no more than `pprof-duration`. Clamp the
values rather than throwing errors, which could change the commands
that existing users might already have in debugging scripts
* Testing: remove test parallelism
The `operator debug` tests that stand up servers can't be run in
parallel, because we don't have a way of canceling the API calls for
pprof. The agent will still be running the last pprof when we exit,
and that breaks the next test that talks to that same agent.
(Because you can only run one pprof at a time on any process!)
We could split off each subtest into its own server, but this test
suite is already very slow. In future work we should fix this "for
real" by making the API call cancelable.
* Testing: assert against unexpected errors in `operator debug` tests.
If we assert there are no unexpected error outputs, it's easier for
the developer to debug when something is going wrong with the tests
because the error output will be presented as a failing test, rather
than just a failing exit code check. Or worse, no failing exit code
check!
This also forces us to be explicit about which tests will return 0
exit codes but still emit (presumably ignorable) error outputs.
Additional minor bug fixes (mostly in tests) and test refactorings:
* Fix text alignment on pprof Duration in `operator debug` output
* Remove "done" channel from `operator debug` event stream test. The
goroutine we're blocking for here already tells us it's done by
sending a value, so block on that instead of an extraneous channel
* Event stream test timer should start at current time, not zero
* Remove noise from `operator debug` test log output. The `t.Logf`
calls already are picked out from the rest of the test output by
being prefixed with the filename.
* Remove explicit pprof args so we use the defaults clamped from
duration/interval
2022-04-07 19:00:07 +00:00
|
|
|
args: []string{"-address", addrServer1, "-region", "never", "-duration", "50ms", "-interval", "50ms", "-server-id", "all", "-node-id", "all"},
|
2021-10-12 20:58:41 +00:00
|
|
|
expectedCode: 1,
|
|
|
|
expectedError: "500 (No path to region)",
|
|
|
|
},
|
|
|
|
}
|
|
|
|
|
|
|
|
runTestCases(t, cases)
|
|
|
|
}

func TestDebug_SingleServer(t *testing.T) {
	srv, _, url := testServer(t, false, nil)
	testutil.WaitForLeader(t, srv.Agent.RPC)

	var cases = testCases{
		{
			name:         "address=api, server-id=leader",
			args:         []string{"-address", url, "-duration", "250ms", "-interval", "250ms", "-server-id", "leader"},
			expectedCode: 0,
			expectedOutputs: []string{
				"Servers: (1/1)",
				"Clients: (0/0)",
				"Created debug archive",
			},
			expectedError: "No node(s) with prefix",
		},
		{
			name:         "address=api, server-id=all",
			args:         []string{"-address", url, "-duration", "250ms", "-interval", "250ms", "-server-id", "all"},
			expectedCode: 0,
			expectedOutputs: []string{
				"Servers: (1/1)",
				"Clients: (0/0)",
				"Created debug archive",
			},
			expectedError: "No node(s) with prefix",
		},
	}

	runTestCases(t, cases)
}
|
|
|
|
|
2020-12-15 18:51:41 +00:00
|
|
|
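// TestDebug_Failures covers argument validation failures: extra positional
// arguments, unknown node IDs, malformed durations, intervals, and pprof
// values, and an invalid address all exit with code 1.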
func TestDebug_Failures(t *testing.T) {
	srv, _, url := testServer(t, false, nil)
	testutil.WaitForLeader(t, srv.Agent.RPC)

	var cases = testCases{
		{
			name: "fails incorrect args",
			args: []string{"some", "bad", "args"},
			expectedCode: 1,
			expectedError: "This command takes no arguments",
		},
		{
			name: "Fails illegal node ids",
			args: []string{"-node-id", "foo:bar"},
			expectedCode: 1,
			expectedError: "Error querying node info",
		},
		{
			name: "Fails missing node ids",
			args: []string{"-node-id", "abc,def", "-duration", "250ms", "-interval", "250ms"},
			expectedCode: 1,
			expectedError: "Error querying node info",
		},
		{
			name: "Fails bad durations",
			args: []string{"-duration", "foo"},
			expectedCode: 1,
			expectedError: "Error parsing duration: foo: time: invalid duration \"foo\"",
		},
		{
			name: "Fails bad intervals",
			args: []string{"-interval", "bar"},
			expectedCode: 1,
			expectedError: "Error parsing interval: bar: time: invalid duration \"bar\"",
		},
		{
			name: "Fails intervals greater than duration",
			args: []string{"-duration", "5m", "-interval", "10m"},
			expectedCode: 1,
			expectedError: "Error parsing interval: 10m is greater than duration 5m",
		},
		{
			name: "Fails bad pprof duration",
			args: []string{"-pprof-duration", "baz"},
			expectedCode: 1,
			expectedError: "Error parsing pprof duration: baz: time: invalid duration \"baz\"",
		},
		{
			name: "Fails bad pprof interval",
			args: []string{"-pprof-interval", "bar"},
			expectedCode: 1,
			expectedError: "Error parsing pprof-interval: bar: time: invalid duration \"bar\"",
		},
		{
			name: "Fails bad address",
			args: []string{"-address", url + "bogus"},
			expectedCode: 1,
			expectedError: "invalid address",
		},
	}

	runTestCases(t, cases)
}

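// TestDebug_Bad_CSIPlugin_Names registers CSI plugins whose IDs contain a
// path separator and a glob, then checks that the debug command sanitizes
// those IDs before writing the per-plugin capture files.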
func TestDebug_Bad_CSIPlugin_Names(t *testing.T) {
	// Start test server and API client
	srv, _, url := testServer(t, false, nil)

	// Wait for leadership to establish
	testutil.WaitForLeader(t, srv.Agent.RPC)

	cases := []string{
		"aws/ebs",
		"gcp-*-1",
	}
	for _, pluginName := range cases {
		cleanup := state.CreateTestCSIPlugin(srv.Agent.Server().State(), pluginName)
		defer cleanup()
	}

	// Setup mock UI
	ui := cli.NewMockUi()
	cmd := &OperatorDebugCommand{Meta: Meta{Ui: ui}}
	testDir := t.TempDir()
	defer os.Remove(testDir)

	// Debug on the leader and all client nodes
	code := cmd.Run([]string{"-address", url, "-duration", "250ms", "-interval", "250ms", "-server-id", "leader", "-node-id", "all", "-output", testDir})
	assert.Equal(t, 0, code)

	// Bad plugin name should be escaped before it reaches the sandbox test
	require.NotContains(t, ui.ErrorWriter.String(), "file path escapes capture directory")
	require.Contains(t, ui.OutputWriter.String(), "Starting debugger")

	path := cmd.collectDir

	var pluginFiles []string
	for _, pluginName := range cases {
		pluginFile := fmt.Sprintf("csi-plugin-id-%s.json", helper.CleanFilename(pluginName, "_"))
		pluginFile = filepath.Join(path, intervalDir, "0000", pluginFile)
		pluginFiles = append(pluginFiles, pluginFile)
	}

	testutil.WaitForFiles(t, pluginFiles)
}

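// buildPathSlice joins each file name onto the given base path, returning the
// full list of paths to wait for.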
func buildPathSlice(path string, files []string) []string {
	paths := []string{}
	for _, file := range files {
		paths = append(paths, filepath.Join(path, file))
	}
	return paths
}

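// TestDebug_CapturedFiles runs the debug command against a server and client
// and waits for the expected cluster, client, server, and per-interval
// capture files to appear in the collect directory.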
func TestDebug_CapturedFiles(t *testing.T) {
	srv, _, url := testServer(t, true, nil)
	testutil.WaitForLeader(t, srv.Agent.RPC)

	serverNodeName := srv.Config.NodeName
	region := srv.Config.Region
	serverName := fmt.Sprintf("%s.%s", serverNodeName, region)
	clientID := srv.Agent.Client().NodeID()
	testutil.WaitForClient(t, srv.Agent.Client().RPC, clientID, srv.Agent.Client().Region())

	t.Logf("serverName: %s, clientID: %s", serverName, clientID)

	// Setup file slices
	clusterFiles := []string{
		"agent-self.json",
		"members.json",
		"namespaces.json",
		"regions.json",
	}

	pprofFiles := []string{
		"allocs.prof",
		"goroutine-debug1.txt",
		"goroutine-debug2.txt",
		"goroutine.prof",
		"heap.prof",
		"profile_0000.prof",
		"threadcreate.prof",
		"trace.prof",
	}

	clientFiles := []string{
		"agent-host.json",
		"monitor.log",
	}
	clientFiles = append(clientFiles, pprofFiles...)

	serverFiles := []string{
		"agent-host.json",
		"monitor.log",
	}
	serverFiles = append(serverFiles, pprofFiles...)

	intervalFiles := []string{
		"allocations.json",
		"csi-plugins.json",
		"csi-volumes.json",
		"deployments.json",
		"evaluations.json",
		"jobs.json",
		"license.json",
		"metrics.json",
		"nodes.json",
		"operator-autopilot-health.json",
		"operator-raft.json",
		"operator-scheduler.json",
	}

	ui := cli.NewMockUi()
	cmd := &OperatorDebugCommand{Meta: Meta{Ui: ui}}
	testDir := t.TempDir()
	defer os.Remove(testDir)

	duration := 2 * time.Second
	interval := 750 * time.Millisecond
	waitTime := 2 * duration

	code := cmd.Run([]string{
		"-address", url,
		"-output", testDir,
		"-server-id", serverName,
		"-node-id", clientID,
		"-duration", duration.String(),
		"-interval", interval.String(),
	})

	// There should be no errors
	require.Empty(t, ui.ErrorWriter.String())
	require.Equal(t, 0, code)
	ui.ErrorWriter.Reset()

	// Verify cluster files
	clusterPaths := buildPathSlice(cmd.path(clusterDir), clusterFiles)
	t.Logf("Waiting for cluster files in path: %s", clusterDir)
	testutil.WaitForFilesUntil(t, clusterPaths, waitTime)

	// Verify client files
	clientPaths := buildPathSlice(cmd.path(clientDir, clientID), clientFiles)
	t.Logf("Waiting for client files in path: %s", clientDir)
	testutil.WaitForFilesUntil(t, clientPaths, waitTime)

	// Verify server files
	serverPaths := buildPathSlice(cmd.path(serverDir, serverName), serverFiles)
	t.Logf("Waiting for server files in path: %s", serverDir)
	testutil.WaitForFilesUntil(t, serverPaths, waitTime)

	// Verify interval 0000 files
	intervalPaths0 := buildPathSlice(cmd.path(intervalDir, "0000"), intervalFiles)
	t.Logf("Waiting for interval 0000 files in path: %s", intervalDir)
	testutil.WaitForFilesUntil(t, intervalPaths0, waitTime)

	// Verify interval 0001 files
	intervalPaths1 := buildPathSlice(cmd.path(intervalDir, "0001"), intervalFiles)
	t.Logf("Waiting for interval 0001 files in path: %s", intervalDir)
	testutil.WaitForFilesUntil(t, intervalPaths1, waitTime)
}

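// TestDebug_ExistingOutput expects exit code 2 when the timestamped capture
// directory already exists under the -output path.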
func TestDebug_ExistingOutput(t *testing.T) {
	ci.Parallel(t)

	ui := cli.NewMockUi()
	cmd := &OperatorDebugCommand{Meta: Meta{Ui: ui}}

	// Fails existing output
	format := "2006-01-02-150405Z"
	stamped := "nomad-debug-" + time.Now().UTC().Format(format)
	tempDir := t.TempDir()
	path := filepath.Join(tempDir, stamped)
	os.MkdirAll(path, 0755)
	defer os.Remove(tempDir)

	code := cmd.Run([]string{"-output", tempDir, "-duration", "50ms", "-interval", "50ms"})
	require.Equal(t, 2, code)
}

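// TestDebug_Fail_Pprof disables the agent's debug endpoints and expects pprof
// collection to fail with a permission error while the debug run still
// succeeds and produces an archive.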
func TestDebug_Fail_Pprof(t *testing.T) {
	// Setup agent config with debug endpoints disabled
	agentConfFunc := func(c *agent.Config) {
		c.EnableDebug = false
	}

	// Start test server and API client
	srv, _, url := testServer(t, false, agentConfFunc)

	// Wait for leadership to establish
	testutil.WaitForLeader(t, srv.Agent.RPC)

	// Setup mock UI
	ui := cli.NewMockUi()
	cmd := &OperatorDebugCommand{Meta: Meta{Ui: ui}}

	// Debug on server with endpoints disabled
	code := cmd.Run([]string{"-address", url, "-duration", "250ms", "-interval", "250ms", "-server-id", "all"})

	assert.Equal(t, 0, code) // Pprof failure isn't fatal
	require.Contains(t, ui.OutputWriter.String(), "Starting debugger")
	require.Contains(t, ui.ErrorWriter.String(), "Failed to retrieve pprof") // Should report pprof failure
	require.Contains(t, ui.ErrorWriter.String(), "Permission denied")        // Specifically permission denied
	require.Contains(t, ui.OutputWriter.String(), "Created debug archive")   // Archive should be generated anyway
}

// TestDebug_PprofVersionCheck asserts that only agent versions matching the
// pprof version filter (>= 0.11.0, <= 0.11.2) are rejected by the version
// constraint check.
func TestDebug_PprofVersionCheck(t *testing.T) {
	cases := []struct {
		version string
		errMsg  string
	}{
		{"0.8.7", ""},
		{"0.11.1", "unsupported version=0.11.1 matches version filter >= 0.11.0, <= 0.11.2"},
		{"0.11.2", "unsupported version=0.11.2 matches version filter >= 0.11.0, <= 0.11.2"},
		{"0.11.2+ent", "unsupported version=0.11.2+ent matches version filter >= 0.11.0, <= 0.11.2"},
		{"0.11.3", ""},
		{"0.11.3+ent", ""},
		{"0.12.0", ""},
		{"1.3.0", ""},
		{"foo.bar", "error: Malformed version: foo.bar"},
	}

	for _, tc := range cases {
		t.Run(tc.version, func(t *testing.T) {
			err := checkVersion(tc.version, minimumVersionPprofConstraint)
			if tc.errMsg == "" {
				require.NoError(t, err, "expected no error from %s", tc.version)
			} else {
				require.EqualError(t, err, tc.errMsg)
			}
		})
	}
}

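// TestDebug_StringToSlice checks that stringToSlice splits comma-separated
// input, trims surrounding whitespace, and drops empty elements.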
func TestDebug_StringToSlice(t *testing.T) {
	ci.Parallel(t)

	cases := []struct {
		input    string
		expected []string
	}{
		{input: ",,", expected: []string(nil)},
		{input: "", expected: []string(nil)},
		{input: "foo, bar", expected: []string{"foo", "bar"}},
		{input: " foo, bar ", expected: []string{"foo", "bar"}},
		{input: "foo,,bar", expected: []string{"foo", "bar"}},
	}
	for _, tc := range cases {
		out := stringToSlice(tc.input)
		require.Equal(t, tc.expected, out)
	}
}

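// TestDebug_External checks how an external address is normalized: an
// explicit http:// or https:// prefix overrides the ssl flag, otherwise the
// ssl flag picks the protocol, and a missing port is left as-is.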
func TestDebug_External(t *testing.T) {
	ci.Parallel(t)

	// address calculation honors CONSUL_HTTP_SSL
	// ssl: true - Correct alignment
	e := &external{addrVal: "https://127.0.0.1:8500", ssl: true}
	addr := e.addr("foo")
	require.Equal(t, "https://127.0.0.1:8500", addr)

	// ssl: true - protocol incorrect
	// NOTE: Address with protocol now overrides ssl flag
	e = &external{addrVal: "http://127.0.0.1:8500", ssl: true}
	addr = e.addr("foo")
	require.Equal(t, "http://127.0.0.1:8500", addr)

	// ssl: true - protocol missing
	e = &external{addrVal: "127.0.0.1:8500", ssl: true}
	addr = e.addr("foo")
	require.Equal(t, "https://127.0.0.1:8500", addr)

	// ssl: false - correct alignment
	e = &external{addrVal: "http://127.0.0.1:8500", ssl: false}
	addr = e.addr("foo")
	require.Equal(t, "http://127.0.0.1:8500", addr)

	// ssl: false - protocol incorrect
	// NOTE: Address with protocol now overrides ssl flag
	e = &external{addrVal: "https://127.0.0.1:8500", ssl: false}
	addr = e.addr("foo")
	require.Equal(t, "https://127.0.0.1:8500", addr)

	// ssl: false - protocol missing
	e = &external{addrVal: "127.0.0.1:8500", ssl: false}
	addr = e.addr("foo")
	require.Equal(t, "http://127.0.0.1:8500", addr)

	// Address through proxy might not have a port
	e = &external{addrVal: "https://127.0.0.1", ssl: true}
	addr = e.addr("foo")
	require.Equal(t, "https://127.0.0.1", addr)
}

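// TestDebug_WriteBytes_Nil verifies that writing a nil byte slice still
// creates the target file inside the collect directory.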
func TestDebug_WriteBytes_Nil(t *testing.T) {
	ci.Parallel(t)

	var testDir, testFile, testPath string
	var testBytes []byte

	// Setup mock UI
	ui := cli.NewMockUi()
	cmd := &OperatorDebugCommand{Meta: Meta{Ui: ui}}

	testDir = t.TempDir()
	defer os.Remove(testDir)
	cmd.collectDir = testDir

	testFile = "test_nil.json"
	testPath = filepath.Join(testDir, testFile)

	// Write nil file at top level of collect directory
	err := cmd.writeBytes("", testFile, testBytes)
	require.NoError(t, err)
	require.FileExists(t, testPath)
}

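// TestDebug_WriteBytes_PathEscapesSandbox verifies that writeBytes returns an
// error when the target path falls outside the collect directory.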
func TestDebug_WriteBytes_PathEscapesSandbox(t *testing.T) {
	ci.Parallel(t)

	var testDir, testFile string
	var testBytes []byte

	testDir = t.TempDir()
	defer os.Remove(testDir)

	testFile = "testing.json"
	testPath := filepath.Join(testDir, testFile)
	defer os.Remove(testPath)

	// Setup mock UI
	ui := cli.NewMockUi()
	cmd := &OperatorDebugCommand{Meta: Meta{Ui: ui}}

	// Empty collectDir will always appear to be escaped
	cmd.collectDir = ""
	err := cmd.writeBytes(testDir, testFile, testBytes)
	require.Error(t, err)
}

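// TestDebug_CollectConsul runs against an embedded Consul test server and
// verifies that the Consul agent and leader captures are written into the
// collect directory.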
func TestDebug_CollectConsul(t *testing.T) {
	ci.Parallel(t)
	if testing.Short() {
		t.Skip("-short set; skipping")
	}

	// Skip test if Consul binary cannot be found
	clienttest.RequireConsul(t)

	// Create an embedded Consul server
	testconsul, err := consultest.NewTestServerConfigT(t, func(c *consultest.TestServerConfig) {
		c.Peering = nil // fix for older versions of Consul (<1.13.0) that don't support peering
		// If -v wasn't specified, squelch Consul logging
		if !testing.Verbose() {
			c.Stdout = ioutil.Discard
			c.Stderr = ioutil.Discard
		}
	})
	require.NoError(t, err)
	if err != nil {
		t.Fatalf("error starting test consul server: %v", err)
	}
	defer testconsul.Stop()

	consulConfig := consulapi.DefaultConfig()
	consulConfig.Address = testconsul.HTTPAddr

	// Setup mock UI
	ui := cli.NewMockUi()
	c := &OperatorDebugCommand{Meta: Meta{Ui: ui}}

	// Setup Consul *external
	ce := &external{}
	ce.setAddr(consulConfig.Address)
	if ce.ssl {
		ce.tls = &api.TLSConfig{}
	}

	// Set global client
	c.consul = ce

	// Setup capture directory
	testDir := t.TempDir()
	defer os.Remove(testDir)
	c.collectDir = testDir

	// Collect data from Consul into folder "test"
	c.collectConsul("test")

	require.Empty(t, ui.ErrorWriter.String())
	require.FileExists(t, filepath.Join(testDir, "test", "consul-agent-host.json"))
	require.FileExists(t, filepath.Join(testDir, "test", "consul-agent-members.json"))
	require.FileExists(t, filepath.Join(testDir, "test", "consul-agent-metrics.json"))
	require.FileExists(t, filepath.Join(testDir, "test", "consul-leader.json"))
}

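// TestDebug_CollectVault runs against a Vault test server and verifies that
// the Vault health capture is written into the collect directory.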
func TestDebug_CollectVault(t *testing.T) {
	ci.Parallel(t)
	if testing.Short() {
		t.Skip("-short set; skipping")
	}

	// Skip test if Vault binary cannot be found
	clienttest.RequireVault(t)

	// Create a Vault server
	v := testutil.NewTestVault(t)
	defer v.Stop()

	// Setup mock UI
	ui := cli.NewMockUi()
	c := &OperatorDebugCommand{Meta: Meta{Ui: ui}}

	// Setup Vault *external
	ve := &external{}
	ve.tokenVal = v.RootToken
	ve.setAddr(v.HTTPAddr)
	if ve.ssl {
		ve.tls = &api.TLSConfig{}
	}

	// Set global client
	c.vault = ve

	// Set capture directory
	testDir := t.TempDir()
	defer os.Remove(testDir)
	c.collectDir = testDir

	// Collect data from Vault
	err := c.collectVault("test", "")

	require.NoError(t, err)
	require.Empty(t, ui.ErrorWriter.String())

	require.FileExists(t, filepath.Join(testDir, "test", "vault-sys-health.json"))
}

// TestDebug_RedirectError asserts that redirect errors are detected so they
// can be translated into more understandable output.
func TestDebug_RedirectError(t *testing.T) {
	ci.Parallel(t)

	// Create a test server that always returns the error many versions of
	// Nomad return instead of a 404 for unknown paths.
	// 1st request redirects to /ui/
	// 2nd request returns UI's HTML
	ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		if strings.HasSuffix(r.URL.String(), "/ui/") {
			fmt.Fprintln(w, `<html>Fake UI HTML</html>`)
			return
		}

		w.Header().Set("Location", "/ui/")
		w.WriteHeader(307)
		fmt.Fprintln(w, `<a href="/ui/">Temporary Redirect</a>.`)
	}))
	defer ts.Close()

	config := api.DefaultConfig()
	config.Address = ts.URL
	client, err := api.NewClient(config)
	require.NoError(t, err)

	resp, err := client.Agent().Host("abc", "", nil)
	assert.Nil(t, resp)
	assert.True(t, isRedirectError(err), err.Error())
}

// TestDebug_StaleLeadership verifies that APIs that are required to
// complete a debug run have their query options configured with the
// -stale flag
func TestDebug_StaleLeadership(t *testing.T) {
	srv, _, url := testServerWithoutLeader(t, false, nil)
	addrServer := srv.HTTPAddr()

	t.Logf("testAgent api address: %s", url)
	t.Logf("Server api address: %s", addrServer)

	var cases = testCases{
		{
			name: "no leader without stale flag",
			args: []string{"-address", addrServer,
				"-duration", "250ms", "-interval", "250ms",
				"-server-id", "all", "-node-id", "all"},
			expectedCode: 1,
			expectedError: "No cluster leader",
		},
		{
			name: "no leader with stale flag",
			args: []string{
				"-address", addrServer,
				"-duration", "250ms", "-interval", "250ms",
				"-server-id", "all", "-node-id", "all",
				"-stale"},
			expectedCode: 0,
			expectedOutputs: []string{"Created debug archive"},
			expectedError: "No node(s) with prefix", // still exits 0
		},
	}

	runTestCases(t, cases)
}

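// testServerWithoutLeader starts a test agent whose server expects three
// peers (BootstrapExpect = 3), so no leader is ever elected.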
func testServerWithoutLeader(t *testing.T, runClient bool, cb func(*agent.Config)) (*agent.TestAgent, *api.Client, string) {
	// Make a new test server
	a := agent.NewTestAgent(t, t.Name(), func(config *agent.Config) {
		config.Client.Enabled = runClient
		config.Server.Enabled = true
		config.Server.NumSchedulers = pointer.Of(0)
		config.Server.BootstrapExpect = 3

		if cb != nil {
			cb(config)
		}
	})
	t.Cleanup(func() { a.Shutdown() })

	c := a.Client()
	return a, c, a.HTTPAddr()
}

// testOutput is used to receive test output from a channel
type testOutput struct {
	name   string
	code   int
	output string
	error  string
}

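// TestDebug_EventStream_TopicsFromString checks that topicsFromString parses
// comma-separated Topic:filter lists, including the special values "all" and
// "none", and capitalizes lowercased topic names.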
func TestDebug_EventStream_TopicsFromString(t *testing.T) {
|
2022-03-15 12:42:43 +00:00
|
|
|
ci.Parallel(t)
|
|
|
|
|
2022-01-18 02:35:51 +00:00
|
|
|
cases := []struct {
|
|
|
|
name string
|
|
|
|
topicList string
|
|
|
|
want map[api.Topic][]string
|
|
|
|
}{
|
|
|
|
{
|
|
|
|
name: "topics = all",
|
|
|
|
topicList: "all",
|
|
|
|
want: allTopics(),
|
|
|
|
},
|
|
|
|
{
|
|
|
|
name: "topics = none",
|
|
|
|
topicList: "none",
|
|
|
|
want: nil,
|
|
|
|
},
|
|
|
|
{
|
|
|
|
name: "two topics",
|
|
|
|
topicList: "Deployment,Job",
|
|
|
|
want: map[api.Topic][]string{
|
|
|
|
"Deployment": {"*"},
|
|
|
|
"Job": {"*"},
|
|
|
|
},
|
|
|
|
},
|
|
|
|
{
|
|
|
|
name: "multiple topics and filters (using api const)",
|
|
|
|
topicList: "Evaluation:example,Job:*,Node:*",
|
|
|
|
want: map[api.Topic][]string{
|
|
|
|
api.TopicEvaluation: {"example"},
|
|
|
|
api.TopicJob: {"*"},
|
|
|
|
api.TopicNode: {"*"},
|
|
|
|
},
|
|
|
|
},
|
|
|
|
{
|
|
|
|
name: "capitalize topics",
|
|
|
|
topicList: "evaluation:example,job:*,node:*",
|
|
|
|
want: map[api.Topic][]string{
|
|
|
|
api.TopicEvaluation: {"example"},
|
|
|
|
api.TopicJob: {"*"},
|
|
|
|
api.TopicNode: {"*"},
|
|
|
|
},
|
|
|
|
},
|
|
|
|
{
|
|
|
|
name: "all topics for filterKey",
|
|
|
|
topicList: "*:example",
|
|
|
|
want: map[api.Topic][]string{
|
|
|
|
"*": {"example"},
|
|
|
|
},
|
|
|
|
},
|
|
|
|
}
|
|
|
|
|
|
|
|
for _, tc := range cases {
|
|
|
|
t.Run(tc.name, func(t *testing.T) {
|
|
|
|
got, err := topicsFromString(tc.topicList)
|
|
|
|
require.NoError(t, err)
|
|
|
|
require.Equal(t, tc.want, got)
|
|
|
|
})
|
|
|
|
}
|
|
|
|
}
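// For context only: the behavior exercised above can be implemented by a
// small parser along the following lines. This sketch is reconstructed from
// the test cases alone; the real topicsFromString in this package may be
// structured differently.
func topicsFromStringSketch(topicList string) (map[api.Topic][]string, error) {
	switch topicList {
	case "none":
		return nil, nil
	case "all":
		return allTopics(), nil
	}

	topics := make(map[api.Topic][]string)
	for _, item := range strings.Split(topicList, ",") {
		parts := strings.SplitN(item, ":", 2)

		// Capitalize the topic name so "job:*" and "Job:*" are equivalent.
		name := parts[0]
		if name != "" {
			name = strings.ToUpper(name[:1]) + name[1:]
		}

		// A bare topic with no filter key defaults to the "*" wildcard.
		filter := "*"
		if len(parts) == 2 {
			filter = parts[1]
		}
		topics[api.Topic(name)] = append(topics[api.Topic(name)], filter)
	}
	return topics, nil
}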
|
|
|
|
|
|
|
|
func TestDebug_EventStream(t *testing.T) {
|
2022-03-15 12:42:43 +00:00
|
|
|
ci.Parallel(t)
|
|
|
|
|
2022-01-18 02:35:51 +00:00
|
|
|
// TODO dmay: specify output directory to allow inspection of eventstream.json
|
|
|
|
// TODO dmay: require specific events in the eventstream.json file(s)
|
|
|
|
// TODO dmay: scenario where no events are expected, verify "No events captured"
|
|
|
|
// TODO dmay: verify event topic filtering only includes expected events
|
|
|
|
|
2022-04-07 19:00:07 +00:00
|
|
|
start := time.Now()
|
2022-01-18 02:35:51 +00:00
|
|
|
|
|
|
|
// Start test server
|
|
|
|
srv, client, url := testServer(t, true, nil)
|
2022-04-07 19:00:07 +00:00
|
|
|
t.Logf("%s: test server started, waiting for leadership to establish\n", time.Since(start))
|
2022-01-18 02:35:51 +00:00
|
|
|
|
|
|
|
// Ensure leader is ready
|
|
|
|
testutil.WaitForLeader(t, srv.Agent.RPC)
|
2022-04-07 19:00:07 +00:00
|
|
|
t.Logf("%s: Leadership established\n", time.Since(start))
|
2022-01-18 02:35:51 +00:00
|
|
|
|
|
|
|
// Setup mock UI
|
|
|
|
ui := cli.NewMockUi()
|
|
|
|
cmd := &OperatorDebugCommand{Meta: Meta{Ui: ui}}
|
|
|
|
|
2022-04-07 19:00:07 +00:00
|
|
|
// Return command output back to the main test goroutine
|
2022-01-18 02:35:51 +00:00
|
|
|
chOutput := make(chan testOutput)
|
|
|
|
|
|
|
|
// Set duration for capture
|
|
|
|
duration := 5 * time.Second
|
|
|
|
// Fail with timeout if duration is exceeded by 5 seconds
|
|
|
|
timeout := duration + 5*time.Second
|
|
|
|
|
|
|
|
// Run debug in a goroutine so we can start the capture before we run the test job
|
2022-04-07 19:00:07 +00:00
|
|
|
t.Logf("%s: Starting nomad operator debug in goroutine\n", time.Since(start))
|
2022-01-18 02:35:51 +00:00
|
|
|
go func() {
|
|
|
|
code := cmd.Run([]string{"-address", url, "-duration", duration.String(), "-interval", "5s", "-event-topic", "Job:*"})
|
|
|
|
assert.Equal(t, 0, code)
|
|
|
|
|
|
|
|
chOutput <- testOutput{
|
|
|
|
name: "yo",
|
|
|
|
code: code,
|
|
|
|
output: ui.OutputWriter.String(),
|
|
|
|
error: ui.ErrorWriter.String(),
|
|
|
|
}
|
|
|
|
}()
|
|
|
|
|
|
|
|
// Start test job
|
2022-04-07 19:00:07 +00:00
|
|
|
t.Logf("%s: Running test job\n", time.Since(start))
|
2022-01-18 02:35:51 +00:00
|
|
|
job := testJob("event_stream_test")
|
|
|
|
resp, _, err := client.Jobs().Register(job, nil)
|
2022-04-07 19:00:07 +00:00
|
|
|
t.Logf("%s: Test job started\n", time.Since(start))
|
2022-01-18 02:35:51 +00:00
|
|
|
|
|
|
|
// Ensure job registered
|
|
|
|
require.NoError(t, err)
|
|
|
|
|
|
|
|
// Wait for the job to complete
|
|
|
|
if code := waitForSuccess(ui, client, fullId, t, resp.EvalID); code != 0 {
|
|
|
|
switch code {
|
|
|
|
case 1:
|
|
|
|
t.Fatalf("status code 1: All other failures (API connectivity, internal errors, etc)\n")
|
|
|
|
case 2:
|
|
|
|
t.Fatalf("status code 2: Problem scheduling job (impossible constraints, resources exhausted, etc)\n")
|
|
|
|
default:
|
|
|
|
t.Fatalf("status code non zero saw %d\n", code)
|
|
|
|
}
|
|
|
|
}
|
2022-04-07 19:00:07 +00:00
|
|
|
t.Logf("%s: test job is complete, eval id: %s\n", time.Since(start), resp.EvalID)
|
2022-01-18 02:35:51 +00:00
|
|
|
|
|
|
|
// Capture the output struct from nomad operator debug goroutine
|
|
|
|
var testOut testOutput
|
2022-04-07 19:00:07 +00:00
|
|
|
select {
|
|
|
|
case testOut = <-chOutput:
|
|
|
|
t.Logf("%s: goroutine is complete", time.Since(start))
|
|
|
|
case <-time.After(timeout):
|
|
|
|
t.Fatalf("timed out waiting for event stream event (duration: %s, timeout: %s", duration, timeout)
|
2022-01-18 02:35:51 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
t.Logf("Values from struct -- code: %d, len(out): %d, len(outerr): %d\n", testOut.code, len(testOut.output), len(testOut.error))
|
|
|
|
|
|
|
|
require.Empty(t, testOut.error)
|
|
|
|
|
|
|
|
archive := extractArchiveName(testOut.output)
|
|
|
|
require.NotEmpty(t, archive)
|
|
|
|
fmt.Println(archive)
|
|
|
|
|
|
|
|
// TODO dmay: verify eventstream.json output file contains expected content
|
|
|
|
}
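// One possible shape for the TODO above (illustrative only; the capture
// directory layout and the eventstream.json location are assumptions that
// would need to be checked against the real archive contents):
func assertEventStreamCaptured(t *testing.T, captureDir string) {
	t.Helper()

	raw, err := ioutil.ReadFile(filepath.Join(captureDir, "eventstream.json"))
	require.NoError(t, err)

	// At minimum the capture should mention the topic the test subscribed to.
	require.Contains(t, string(raw), "Job")
}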
|
|
|
|
|
|
|
|
// extractArchiveName searches the capture output for the debug archive filename
|
|
|
|
func extractArchiveName(captureOutput string) string {
|
|
|
|
file := ""
|
|
|
|
|
|
|
|
r := regexp.MustCompile(`Created debug archive: (.+)?\n`)
|
|
|
|
res := r.FindStringSubmatch(captureOutput)
|
|
|
|
// If found, there will be 2 elements, where element [1] is the desired text from the submatch
|
|
|
|
if len(res) == 2 {
|
|
|
|
file = res[1]
|
|
|
|
}
|
|
|
|
|
|
|
|
return file
|
|
|
|
}
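// For example (hypothetical capture output):
//
//	extractArchiveName("Created debug archive: nomad-debug-2022-04-07.tar.gz\n") // => "nomad-debug-2022-04-07.tar.gz"
//	extractArchiveName("no archive line in this output")                         // => ""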
|