open-nomad/command/operator_debug_test.go

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

1074 lines
31 KiB
Go
Raw Normal View History

// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: MPL-2.0
package command
import (
"fmt"
"io"
"net/http"
"net/http/httptest"
"os"
"path/filepath"
"regexp"
"strings"
"testing"
"time"
consulapi "github.com/hashicorp/consul/api"
consultest "github.com/hashicorp/consul/sdk/testutil"
"github.com/hashicorp/nomad/api"
"github.com/hashicorp/nomad/ci"
clienttest "github.com/hashicorp/nomad/client/testutil"
"github.com/hashicorp/nomad/command/agent"
"github.com/hashicorp/nomad/helper"
"github.com/hashicorp/nomad/helper/pointer"
"github.com/hashicorp/nomad/nomad/state"
"github.com/hashicorp/nomad/testutil"
"github.com/mitchellh/cli"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
// NOTE: most of these tests cannot be run in parallel
type testCase struct {
name string
args []string
expectedCode int
expectedOutputs []string
expectedError string
}
type testCases []testCase
func runTestCases(t *testing.T, cases testCases) {
t.Helper()
for _, c := range cases {
t.Run(c.name, func(t *testing.T) {
ui := cli.NewMockUi()
cmd := &OperatorDebugCommand{Meta: Meta{Ui: ui}}
code := cmd.Run(c.args)
out := ui.OutputWriter.String()
outerr := ui.ErrorWriter.String()
assert.Equalf(t, c.expectedCode, code, "did not get expected exit code")
Fix flaky `operator debug` test (#12501) We introduced a `pprof-interval` argument to `operator debug` in #11938, and unfortunately this has resulted in a lot of test flakes. The actual command in use is mostly fine (although I've fixed some quirks here), so what's really happened is that the change has revealed some existing issues in the tests. Summary of changes: * Make first pprof collection synchronous to preserve the existing behavior for the common case where the pprof interval matches the duration. * Clamp `operator debug` pprof timing to that of the command. The `pprof-duration` should be no more than `duration` and the `pprof-interval` should be no more than `pprof-duration`. Clamp the values rather than throwing errors, which could change the commands that existing users might already have in debugging scripts * Testing: remove test parallelism The `operator debug` tests that stand up servers can't be run in parallel, because we don't have a way of canceling the API calls for pprof. The agent will still be running the last pprof when we exit, and that breaks the next test that talks to that same agent. (Because you can only run one pprof at a time on any process!) We could split off each subtest into its own server, but this test suite is already very slow. In future work we should fix this "for real" by making the API call cancelable. * Testing: assert against unexpected errors in `operator debug` tests. If we assert there are no unexpected error outputs, it's easier for the developer to debug when something is going wrong with the tests because the error output will be presented as a failing test, rather than just a failing exit code check. Or worse, no failing exit code check! This also forces us to be explicit about which tests will return 0 exit codes but still emit (presumably ignorable) error outputs. Additional minor bug fixes (mostly in tests) and test refactorings: * Fix text alignment on pprof Duration in `operator debug` output * Remove "done" channel from `operator debug` event stream test. The goroutine we're blocking for here already tells us it's done by sending a value, so block on that instead of an extraneous channel * Event stream test timer should start at current time, not zero * Remove noise from `operator debug` test log output. The `t.Logf` calls already are picked out from the rest of the test output by being prefixed with the filename. * Remove explicit pprof args so we use the defaults clamped from duration/interval
2022-04-07 19:00:07 +00:00
if len(c.expectedOutputs) > 0 {
if assert.NotEmpty(t, out, "command output was empty") {
for _, expectedOutput := range c.expectedOutputs {
assert.Contains(t, out, expectedOutput, "did not get expected output")
}
}
} else {
assert.Empty(t, out, "command output should have been empty")
}
if c.expectedError == "" {
assert.Empty(t, outerr, "got unexpected error")
} else {
assert.Containsf(t, outerr, c.expectedError, "did not get expected error")
}
})
}
}
func newClientAgentConfigFunc(region string, nodeClass string, srvRPCAddr string) func(*agent.Config) {
if region == "" {
region = "global"
}
return func(c *agent.Config) {
c.Region = region
c.Client.NodeClass = nodeClass
c.Client.Servers = []string{srvRPCAddr}
c.Client.Enabled = true
c.Server.Enabled = false
}
}
func TestDebug_NodeClass(t *testing.T) {
// Start test server and API client
srv, _, url := testServer(t, false, nil)
// Wait for leadership to establish
testutil.WaitForLeader(t, srv.Agent.RPC)
// Retrieve server RPC address to join clients
srvRPCAddr := srv.GetConfig().AdvertiseAddrs.RPC
Fix flaky `operator debug` test (#12501) We introduced a `pprof-interval` argument to `operator debug` in #11938, and unfortunately this has resulted in a lot of test flakes. The actual command in use is mostly fine (although I've fixed some quirks here), so what's really happened is that the change has revealed some existing issues in the tests. Summary of changes: * Make first pprof collection synchronous to preserve the existing behavior for the common case where the pprof interval matches the duration. * Clamp `operator debug` pprof timing to that of the command. The `pprof-duration` should be no more than `duration` and the `pprof-interval` should be no more than `pprof-duration`. Clamp the values rather than throwing errors, which could change the commands that existing users might already have in debugging scripts * Testing: remove test parallelism The `operator debug` tests that stand up servers can't be run in parallel, because we don't have a way of canceling the API calls for pprof. The agent will still be running the last pprof when we exit, and that breaks the next test that talks to that same agent. (Because you can only run one pprof at a time on any process!) We could split off each subtest into its own server, but this test suite is already very slow. In future work we should fix this "for real" by making the API call cancelable. * Testing: assert against unexpected errors in `operator debug` tests. If we assert there are no unexpected error outputs, it's easier for the developer to debug when something is going wrong with the tests because the error output will be presented as a failing test, rather than just a failing exit code check. Or worse, no failing exit code check! This also forces us to be explicit about which tests will return 0 exit codes but still emit (presumably ignorable) error outputs. Additional minor bug fixes (mostly in tests) and test refactorings: * Fix text alignment on pprof Duration in `operator debug` output * Remove "done" channel from `operator debug` event stream test. The goroutine we're blocking for here already tells us it's done by sending a value, so block on that instead of an extraneous channel * Event stream test timer should start at current time, not zero * Remove noise from `operator debug` test log output. The `t.Logf` calls already are picked out from the rest of the test output by being prefixed with the filename. * Remove explicit pprof args so we use the defaults clamped from duration/interval
2022-04-07 19:00:07 +00:00
t.Logf("Leader started, srv.GetConfig().AdvertiseAddrs.RPC: %s", srvRPCAddr)
// Start test clients
testClient(t, "client1", newClientAgentConfigFunc("global", "classA", srvRPCAddr))
testClient(t, "client2", newClientAgentConfigFunc("global", "classB", srvRPCAddr))
testClient(t, "client3", newClientAgentConfigFunc("global", "classA", srvRPCAddr))
// Setup test cases
cases := testCases{
{
name: "address=api, node-class=classA, max-nodes=2",
args: []string{"-address", url, "-duration", "250ms", "-interval", "250ms", "-server-id", "all", "-node-id", "all", "-node-class", "classA", "-max-nodes", "2"},
expectedCode: 0,
expectedOutputs: []string{
"Servers: (1/1)",
"Clients: (2/3)",
"Max node count reached (2)",
"Node Class: classA",
"Created debug archive",
},
expectedError: "",
},
{
name: "address=api, node-class=classB, max-nodes=2",
args: []string{"-address", url, "-duration", "250ms", "-interval", "250ms", "-server-id", "all", "-node-id", "all", "-node-class", "classB", "-max-nodes", "2"},
expectedCode: 0,
expectedOutputs: []string{
"Servers: (1/1)",
"Clients: (1/3)",
"Node Class: classB",
"Created debug archive",
},
expectedError: "",
},
}
runTestCases(t, cases)
}
func TestDebug_ClientToServer(t *testing.T) {
// Start test server and API client
srv, _, url := testServer(t, false, nil)
// Wait for leadership to establish
testutil.WaitForLeader(t, srv.Agent.RPC)
// Retrieve server RPC address to join client
srvRPCAddr := srv.GetConfig().AdvertiseAddrs.RPC
Fix flaky `operator debug` test (#12501) We introduced a `pprof-interval` argument to `operator debug` in #11938, and unfortunately this has resulted in a lot of test flakes. The actual command in use is mostly fine (although I've fixed some quirks here), so what's really happened is that the change has revealed some existing issues in the tests. Summary of changes: * Make first pprof collection synchronous to preserve the existing behavior for the common case where the pprof interval matches the duration. * Clamp `operator debug` pprof timing to that of the command. The `pprof-duration` should be no more than `duration` and the `pprof-interval` should be no more than `pprof-duration`. Clamp the values rather than throwing errors, which could change the commands that existing users might already have in debugging scripts * Testing: remove test parallelism The `operator debug` tests that stand up servers can't be run in parallel, because we don't have a way of canceling the API calls for pprof. The agent will still be running the last pprof when we exit, and that breaks the next test that talks to that same agent. (Because you can only run one pprof at a time on any process!) We could split off each subtest into its own server, but this test suite is already very slow. In future work we should fix this "for real" by making the API call cancelable. * Testing: assert against unexpected errors in `operator debug` tests. If we assert there are no unexpected error outputs, it's easier for the developer to debug when something is going wrong with the tests because the error output will be presented as a failing test, rather than just a failing exit code check. Or worse, no failing exit code check! This also forces us to be explicit about which tests will return 0 exit codes but still emit (presumably ignorable) error outputs. Additional minor bug fixes (mostly in tests) and test refactorings: * Fix text alignment on pprof Duration in `operator debug` output * Remove "done" channel from `operator debug` event stream test. The goroutine we're blocking for here already tells us it's done by sending a value, so block on that instead of an extraneous channel * Event stream test timer should start at current time, not zero * Remove noise from `operator debug` test log output. The `t.Logf` calls already are picked out from the rest of the test output by being prefixed with the filename. * Remove explicit pprof args so we use the defaults clamped from duration/interval
2022-04-07 19:00:07 +00:00
t.Logf("Leader started, srv.GetConfig().AdvertiseAddrs.RPC: %s", srvRPCAddr)
// Start client
agent1, _, _ := testClient(t, "client1", newClientAgentConfigFunc("", "", srvRPCAddr))
// Get API addresses
addrServer := srv.HTTPAddr()
addrClient1 := agent1.HTTPAddr()
Fix flaky `operator debug` test (#12501) We introduced a `pprof-interval` argument to `operator debug` in #11938, and unfortunately this has resulted in a lot of test flakes. The actual command in use is mostly fine (although I've fixed some quirks here), so what's really happened is that the change has revealed some existing issues in the tests. Summary of changes: * Make first pprof collection synchronous to preserve the existing behavior for the common case where the pprof interval matches the duration. * Clamp `operator debug` pprof timing to that of the command. The `pprof-duration` should be no more than `duration` and the `pprof-interval` should be no more than `pprof-duration`. Clamp the values rather than throwing errors, which could change the commands that existing users might already have in debugging scripts * Testing: remove test parallelism The `operator debug` tests that stand up servers can't be run in parallel, because we don't have a way of canceling the API calls for pprof. The agent will still be running the last pprof when we exit, and that breaks the next test that talks to that same agent. (Because you can only run one pprof at a time on any process!) We could split off each subtest into its own server, but this test suite is already very slow. In future work we should fix this "for real" by making the API call cancelable. * Testing: assert against unexpected errors in `operator debug` tests. If we assert there are no unexpected error outputs, it's easier for the developer to debug when something is going wrong with the tests because the error output will be presented as a failing test, rather than just a failing exit code check. Or worse, no failing exit code check! This also forces us to be explicit about which tests will return 0 exit codes but still emit (presumably ignorable) error outputs. Additional minor bug fixes (mostly in tests) and test refactorings: * Fix text alignment on pprof Duration in `operator debug` output * Remove "done" channel from `operator debug` event stream test. The goroutine we're blocking for here already tells us it's done by sending a value, so block on that instead of an extraneous channel * Event stream test timer should start at current time, not zero * Remove noise from `operator debug` test log output. The `t.Logf` calls already are picked out from the rest of the test output by being prefixed with the filename. * Remove explicit pprof args so we use the defaults clamped from duration/interval
2022-04-07 19:00:07 +00:00
t.Logf("testAgent api address: %s", url)
t.Logf("Server api address: %s", addrServer)
t.Logf("Client1 api address: %s", addrClient1)
// Setup test cases
var cases = testCases{
{
name: "testAgent api server",
args: []string{"-address", url, "-duration", "250ms", "-interval", "250ms", "-server-id", "all", "-node-id", "all"},
expectedCode: 0,
expectedOutputs: []string{"Created debug archive"},
},
{
name: "server address",
args: []string{"-address", addrServer, "-duration", "250ms", "-interval", "250ms", "-server-id", "all", "-node-id", "all"},
expectedCode: 0,
expectedOutputs: []string{"Created debug archive"},
},
{
name: "client1 address - verify no SIGSEGV panic",
Fix flaky `operator debug` test (#12501) We introduced a `pprof-interval` argument to `operator debug` in #11938, and unfortunately this has resulted in a lot of test flakes. The actual command in use is mostly fine (although I've fixed some quirks here), so what's really happened is that the change has revealed some existing issues in the tests. Summary of changes: * Make first pprof collection synchronous to preserve the existing behavior for the common case where the pprof interval matches the duration. * Clamp `operator debug` pprof timing to that of the command. The `pprof-duration` should be no more than `duration` and the `pprof-interval` should be no more than `pprof-duration`. Clamp the values rather than throwing errors, which could change the commands that existing users might already have in debugging scripts * Testing: remove test parallelism The `operator debug` tests that stand up servers can't be run in parallel, because we don't have a way of canceling the API calls for pprof. The agent will still be running the last pprof when we exit, and that breaks the next test that talks to that same agent. (Because you can only run one pprof at a time on any process!) We could split off each subtest into its own server, but this test suite is already very slow. In future work we should fix this "for real" by making the API call cancelable. * Testing: assert against unexpected errors in `operator debug` tests. If we assert there are no unexpected error outputs, it's easier for the developer to debug when something is going wrong with the tests because the error output will be presented as a failing test, rather than just a failing exit code check. Or worse, no failing exit code check! This also forces us to be explicit about which tests will return 0 exit codes but still emit (presumably ignorable) error outputs. Additional minor bug fixes (mostly in tests) and test refactorings: * Fix text alignment on pprof Duration in `operator debug` output * Remove "done" channel from `operator debug` event stream test. The goroutine we're blocking for here already tells us it's done by sending a value, so block on that instead of an extraneous channel * Event stream test timer should start at current time, not zero * Remove noise from `operator debug` test log output. The `t.Logf` calls already are picked out from the rest of the test output by being prefixed with the filename. * Remove explicit pprof args so we use the defaults clamped from duration/interval
2022-04-07 19:00:07 +00:00
args: []string{"-address", addrClient1, "-duration", "250ms", "-interval", "250ms", "-server-id", "all", "-node-id", "all"},
expectedCode: 0,
expectedOutputs: []string{"Created debug archive"},
},
}
runTestCases(t, cases)
}
func TestDebug_MultiRegion(t *testing.T) {
region1 := "region1"
region2 := "region2"
// Start region1 server
server1, _, addrServer1 := testServer(t, false, func(c *agent.Config) { c.Region = region1 })
testutil.WaitForLeader(t, server1.Agent.RPC)
rpcAddrServer1 := server1.GetConfig().AdvertiseAddrs.RPC
Fix flaky `operator debug` test (#12501) We introduced a `pprof-interval` argument to `operator debug` in #11938, and unfortunately this has resulted in a lot of test flakes. The actual command in use is mostly fine (although I've fixed some quirks here), so what's really happened is that the change has revealed some existing issues in the tests. Summary of changes: * Make first pprof collection synchronous to preserve the existing behavior for the common case where the pprof interval matches the duration. * Clamp `operator debug` pprof timing to that of the command. The `pprof-duration` should be no more than `duration` and the `pprof-interval` should be no more than `pprof-duration`. Clamp the values rather than throwing errors, which could change the commands that existing users might already have in debugging scripts * Testing: remove test parallelism The `operator debug` tests that stand up servers can't be run in parallel, because we don't have a way of canceling the API calls for pprof. The agent will still be running the last pprof when we exit, and that breaks the next test that talks to that same agent. (Because you can only run one pprof at a time on any process!) We could split off each subtest into its own server, but this test suite is already very slow. In future work we should fix this "for real" by making the API call cancelable. * Testing: assert against unexpected errors in `operator debug` tests. If we assert there are no unexpected error outputs, it's easier for the developer to debug when something is going wrong with the tests because the error output will be presented as a failing test, rather than just a failing exit code check. Or worse, no failing exit code check! This also forces us to be explicit about which tests will return 0 exit codes but still emit (presumably ignorable) error outputs. Additional minor bug fixes (mostly in tests) and test refactorings: * Fix text alignment on pprof Duration in `operator debug` output * Remove "done" channel from `operator debug` event stream test. The goroutine we're blocking for here already tells us it's done by sending a value, so block on that instead of an extraneous channel * Event stream test timer should start at current time, not zero * Remove noise from `operator debug` test log output. The `t.Logf` calls already are picked out from the rest of the test output by being prefixed with the filename. * Remove explicit pprof args so we use the defaults clamped from duration/interval
2022-04-07 19:00:07 +00:00
t.Logf("%s: Leader started, HTTPAddr: %s, RPC: %s", region1, addrServer1, rpcAddrServer1)
// Start region1 client
agent1, _, addrClient1 := testClient(t, "client1", newClientAgentConfigFunc(region1, "", rpcAddrServer1))
nodeIdClient1 := agent1.Agent.Client().NodeID()
Fix flaky `operator debug` test (#12501) We introduced a `pprof-interval` argument to `operator debug` in #11938, and unfortunately this has resulted in a lot of test flakes. The actual command in use is mostly fine (although I've fixed some quirks here), so what's really happened is that the change has revealed some existing issues in the tests. Summary of changes: * Make first pprof collection synchronous to preserve the existing behavior for the common case where the pprof interval matches the duration. * Clamp `operator debug` pprof timing to that of the command. The `pprof-duration` should be no more than `duration` and the `pprof-interval` should be no more than `pprof-duration`. Clamp the values rather than throwing errors, which could change the commands that existing users might already have in debugging scripts * Testing: remove test parallelism The `operator debug` tests that stand up servers can't be run in parallel, because we don't have a way of canceling the API calls for pprof. The agent will still be running the last pprof when we exit, and that breaks the next test that talks to that same agent. (Because you can only run one pprof at a time on any process!) We could split off each subtest into its own server, but this test suite is already very slow. In future work we should fix this "for real" by making the API call cancelable. * Testing: assert against unexpected errors in `operator debug` tests. If we assert there are no unexpected error outputs, it's easier for the developer to debug when something is going wrong with the tests because the error output will be presented as a failing test, rather than just a failing exit code check. Or worse, no failing exit code check! This also forces us to be explicit about which tests will return 0 exit codes but still emit (presumably ignorable) error outputs. Additional minor bug fixes (mostly in tests) and test refactorings: * Fix text alignment on pprof Duration in `operator debug` output * Remove "done" channel from `operator debug` event stream test. The goroutine we're blocking for here already tells us it's done by sending a value, so block on that instead of an extraneous channel * Event stream test timer should start at current time, not zero * Remove noise from `operator debug` test log output. The `t.Logf` calls already are picked out from the rest of the test output by being prefixed with the filename. * Remove explicit pprof args so we use the defaults clamped from duration/interval
2022-04-07 19:00:07 +00:00
t.Logf("%s: Client1 started, ID: %s, HTTPAddr: %s", region1, nodeIdClient1, addrClient1)
// Start region2 server
server2, _, addrServer2 := testServer(t, false, func(c *agent.Config) { c.Region = region2 })
testutil.WaitForLeader(t, server2.Agent.RPC)
rpcAddrServer2 := server2.GetConfig().AdvertiseAddrs.RPC
Fix flaky `operator debug` test (#12501) We introduced a `pprof-interval` argument to `operator debug` in #11938, and unfortunately this has resulted in a lot of test flakes. The actual command in use is mostly fine (although I've fixed some quirks here), so what's really happened is that the change has revealed some existing issues in the tests. Summary of changes: * Make first pprof collection synchronous to preserve the existing behavior for the common case where the pprof interval matches the duration. * Clamp `operator debug` pprof timing to that of the command. The `pprof-duration` should be no more than `duration` and the `pprof-interval` should be no more than `pprof-duration`. Clamp the values rather than throwing errors, which could change the commands that existing users might already have in debugging scripts * Testing: remove test parallelism The `operator debug` tests that stand up servers can't be run in parallel, because we don't have a way of canceling the API calls for pprof. The agent will still be running the last pprof when we exit, and that breaks the next test that talks to that same agent. (Because you can only run one pprof at a time on any process!) We could split off each subtest into its own server, but this test suite is already very slow. In future work we should fix this "for real" by making the API call cancelable. * Testing: assert against unexpected errors in `operator debug` tests. If we assert there are no unexpected error outputs, it's easier for the developer to debug when something is going wrong with the tests because the error output will be presented as a failing test, rather than just a failing exit code check. Or worse, no failing exit code check! This also forces us to be explicit about which tests will return 0 exit codes but still emit (presumably ignorable) error outputs. Additional minor bug fixes (mostly in tests) and test refactorings: * Fix text alignment on pprof Duration in `operator debug` output * Remove "done" channel from `operator debug` event stream test. The goroutine we're blocking for here already tells us it's done by sending a value, so block on that instead of an extraneous channel * Event stream test timer should start at current time, not zero * Remove noise from `operator debug` test log output. The `t.Logf` calls already are picked out from the rest of the test output by being prefixed with the filename. * Remove explicit pprof args so we use the defaults clamped from duration/interval
2022-04-07 19:00:07 +00:00
t.Logf("%s: Leader started, HTTPAddr: %s, RPC: %s", region2, addrServer2, rpcAddrServer2)
// Start client2
agent2, _, addrClient2 := testClient(t, "client2", newClientAgentConfigFunc(region2, "", rpcAddrServer2))
nodeIdClient2 := agent2.Agent.Client().NodeID()
Fix flaky `operator debug` test (#12501) We introduced a `pprof-interval` argument to `operator debug` in #11938, and unfortunately this has resulted in a lot of test flakes. The actual command in use is mostly fine (although I've fixed some quirks here), so what's really happened is that the change has revealed some existing issues in the tests. Summary of changes: * Make first pprof collection synchronous to preserve the existing behavior for the common case where the pprof interval matches the duration. * Clamp `operator debug` pprof timing to that of the command. The `pprof-duration` should be no more than `duration` and the `pprof-interval` should be no more than `pprof-duration`. Clamp the values rather than throwing errors, which could change the commands that existing users might already have in debugging scripts * Testing: remove test parallelism The `operator debug` tests that stand up servers can't be run in parallel, because we don't have a way of canceling the API calls for pprof. The agent will still be running the last pprof when we exit, and that breaks the next test that talks to that same agent. (Because you can only run one pprof at a time on any process!) We could split off each subtest into its own server, but this test suite is already very slow. In future work we should fix this "for real" by making the API call cancelable. * Testing: assert against unexpected errors in `operator debug` tests. If we assert there are no unexpected error outputs, it's easier for the developer to debug when something is going wrong with the tests because the error output will be presented as a failing test, rather than just a failing exit code check. Or worse, no failing exit code check! This also forces us to be explicit about which tests will return 0 exit codes but still emit (presumably ignorable) error outputs. Additional minor bug fixes (mostly in tests) and test refactorings: * Fix text alignment on pprof Duration in `operator debug` output * Remove "done" channel from `operator debug` event stream test. The goroutine we're blocking for here already tells us it's done by sending a value, so block on that instead of an extraneous channel * Event stream test timer should start at current time, not zero * Remove noise from `operator debug` test log output. The `t.Logf` calls already are picked out from the rest of the test output by being prefixed with the filename. * Remove explicit pprof args so we use the defaults clamped from duration/interval
2022-04-07 19:00:07 +00:00
t.Logf("%s: Client1 started, ID: %s, HTTPAddr: %s", region2, nodeIdClient2, addrClient2)
Fix flaky `operator debug` test (#12501) We introduced a `pprof-interval` argument to `operator debug` in #11938, and unfortunately this has resulted in a lot of test flakes. The actual command in use is mostly fine (although I've fixed some quirks here), so what's really happened is that the change has revealed some existing issues in the tests. Summary of changes: * Make first pprof collection synchronous to preserve the existing behavior for the common case where the pprof interval matches the duration. * Clamp `operator debug` pprof timing to that of the command. The `pprof-duration` should be no more than `duration` and the `pprof-interval` should be no more than `pprof-duration`. Clamp the values rather than throwing errors, which could change the commands that existing users might already have in debugging scripts * Testing: remove test parallelism The `operator debug` tests that stand up servers can't be run in parallel, because we don't have a way of canceling the API calls for pprof. The agent will still be running the last pprof when we exit, and that breaks the next test that talks to that same agent. (Because you can only run one pprof at a time on any process!) We could split off each subtest into its own server, but this test suite is already very slow. In future work we should fix this "for real" by making the API call cancelable. * Testing: assert against unexpected errors in `operator debug` tests. If we assert there are no unexpected error outputs, it's easier for the developer to debug when something is going wrong with the tests because the error output will be presented as a failing test, rather than just a failing exit code check. Or worse, no failing exit code check! This also forces us to be explicit about which tests will return 0 exit codes but still emit (presumably ignorable) error outputs. Additional minor bug fixes (mostly in tests) and test refactorings: * Fix text alignment on pprof Duration in `operator debug` output * Remove "done" channel from `operator debug` event stream test. The goroutine we're blocking for here already tells us it's done by sending a value, so block on that instead of an extraneous channel * Event stream test timer should start at current time, not zero * Remove noise from `operator debug` test log output. The `t.Logf` calls already are picked out from the rest of the test output by being prefixed with the filename. * Remove explicit pprof args so we use the defaults clamped from duration/interval
2022-04-07 19:00:07 +00:00
t.Logf("Region: %s, Server1 api address: %s", region1, addrServer1)
t.Logf("Region: %s, Client1 api address: %s", region1, addrClient1)
t.Logf("Region: %s, Server2 api address: %s", region2, addrServer2)
t.Logf("Region: %s, Client2 api address: %s", region2, addrClient2)
// Setup test cases
var cases = testCases{
// Good
{
Fix flaky `operator debug` test (#12501) We introduced a `pprof-interval` argument to `operator debug` in #11938, and unfortunately this has resulted in a lot of test flakes. The actual command in use is mostly fine (although I've fixed some quirks here), so what's really happened is that the change has revealed some existing issues in the tests. Summary of changes: * Make first pprof collection synchronous to preserve the existing behavior for the common case where the pprof interval matches the duration. * Clamp `operator debug` pprof timing to that of the command. The `pprof-duration` should be no more than `duration` and the `pprof-interval` should be no more than `pprof-duration`. Clamp the values rather than throwing errors, which could change the commands that existing users might already have in debugging scripts * Testing: remove test parallelism The `operator debug` tests that stand up servers can't be run in parallel, because we don't have a way of canceling the API calls for pprof. The agent will still be running the last pprof when we exit, and that breaks the next test that talks to that same agent. (Because you can only run one pprof at a time on any process!) We could split off each subtest into its own server, but this test suite is already very slow. In future work we should fix this "for real" by making the API call cancelable. * Testing: assert against unexpected errors in `operator debug` tests. If we assert there are no unexpected error outputs, it's easier for the developer to debug when something is going wrong with the tests because the error output will be presented as a failing test, rather than just a failing exit code check. Or worse, no failing exit code check! This also forces us to be explicit about which tests will return 0 exit codes but still emit (presumably ignorable) error outputs. Additional minor bug fixes (mostly in tests) and test refactorings: * Fix text alignment on pprof Duration in `operator debug` output * Remove "done" channel from `operator debug` event stream test. The goroutine we're blocking for here already tells us it's done by sending a value, so block on that instead of an extraneous channel * Event stream test timer should start at current time, not zero * Remove noise from `operator debug` test log output. The `t.Logf` calls already are picked out from the rest of the test output by being prefixed with the filename. * Remove explicit pprof args so we use the defaults clamped from duration/interval
2022-04-07 19:00:07 +00:00
name: "no region - all servers, all clients",
args: []string{"-address", addrServer1, "-duration", "250ms", "-interval", "250ms", "-server-id", "all", "-node-id", "all"},
expectedCode: 0,
expectedOutputs: []string{"Starting debugger"},
},
{
name: "region1 - server1 address",
Fix flaky `operator debug` test (#12501) We introduced a `pprof-interval` argument to `operator debug` in #11938, and unfortunately this has resulted in a lot of test flakes. The actual command in use is mostly fine (although I've fixed some quirks here), so what's really happened is that the change has revealed some existing issues in the tests. Summary of changes: * Make first pprof collection synchronous to preserve the existing behavior for the common case where the pprof interval matches the duration. * Clamp `operator debug` pprof timing to that of the command. The `pprof-duration` should be no more than `duration` and the `pprof-interval` should be no more than `pprof-duration`. Clamp the values rather than throwing errors, which could change the commands that existing users might already have in debugging scripts * Testing: remove test parallelism The `operator debug` tests that stand up servers can't be run in parallel, because we don't have a way of canceling the API calls for pprof. The agent will still be running the last pprof when we exit, and that breaks the next test that talks to that same agent. (Because you can only run one pprof at a time on any process!) We could split off each subtest into its own server, but this test suite is already very slow. In future work we should fix this "for real" by making the API call cancelable. * Testing: assert against unexpected errors in `operator debug` tests. If we assert there are no unexpected error outputs, it's easier for the developer to debug when something is going wrong with the tests because the error output will be presented as a failing test, rather than just a failing exit code check. Or worse, no failing exit code check! This also forces us to be explicit about which tests will return 0 exit codes but still emit (presumably ignorable) error outputs. Additional minor bug fixes (mostly in tests) and test refactorings: * Fix text alignment on pprof Duration in `operator debug` output * Remove "done" channel from `operator debug` event stream test. The goroutine we're blocking for here already tells us it's done by sending a value, so block on that instead of an extraneous channel * Event stream test timer should start at current time, not zero * Remove noise from `operator debug` test log output. The `t.Logf` calls already are picked out from the rest of the test output by being prefixed with the filename. * Remove explicit pprof args so we use the defaults clamped from duration/interval
2022-04-07 19:00:07 +00:00
args: []string{"-address", addrServer1, "-region", region1, "-duration", "50ms", "-interval", "50ms", "-server-id", "all", "-node-id", "all"},
expectedCode: 0,
expectedOutputs: []string{
"Region: " + region1 + "\n",
"Servers: (1/1) [TestDebug_MultiRegion.region1]",
"Clients: (1/1) [" + nodeIdClient1 + "]",
"Created debug archive",
},
},
{
name: "region1 - client1 address",
Fix flaky `operator debug` test (#12501) We introduced a `pprof-interval` argument to `operator debug` in #11938, and unfortunately this has resulted in a lot of test flakes. The actual command in use is mostly fine (although I've fixed some quirks here), so what's really happened is that the change has revealed some existing issues in the tests. Summary of changes: * Make first pprof collection synchronous to preserve the existing behavior for the common case where the pprof interval matches the duration. * Clamp `operator debug` pprof timing to that of the command. The `pprof-duration` should be no more than `duration` and the `pprof-interval` should be no more than `pprof-duration`. Clamp the values rather than throwing errors, which could change the commands that existing users might already have in debugging scripts * Testing: remove test parallelism The `operator debug` tests that stand up servers can't be run in parallel, because we don't have a way of canceling the API calls for pprof. The agent will still be running the last pprof when we exit, and that breaks the next test that talks to that same agent. (Because you can only run one pprof at a time on any process!) We could split off each subtest into its own server, but this test suite is already very slow. In future work we should fix this "for real" by making the API call cancelable. * Testing: assert against unexpected errors in `operator debug` tests. If we assert there are no unexpected error outputs, it's easier for the developer to debug when something is going wrong with the tests because the error output will be presented as a failing test, rather than just a failing exit code check. Or worse, no failing exit code check! This also forces us to be explicit about which tests will return 0 exit codes but still emit (presumably ignorable) error outputs. Additional minor bug fixes (mostly in tests) and test refactorings: * Fix text alignment on pprof Duration in `operator debug` output * Remove "done" channel from `operator debug` event stream test. The goroutine we're blocking for here already tells us it's done by sending a value, so block on that instead of an extraneous channel * Event stream test timer should start at current time, not zero * Remove noise from `operator debug` test log output. The `t.Logf` calls already are picked out from the rest of the test output by being prefixed with the filename. * Remove explicit pprof args so we use the defaults clamped from duration/interval
2022-04-07 19:00:07 +00:00
args: []string{"-address", addrClient1, "-region", region1, "-duration", "50ms", "-interval", "50ms", "-server-id", "all", "-node-id", "all"},
expectedCode: 0,
expectedOutputs: []string{
"Region: " + region1 + "\n",
"Servers: (1/1) [TestDebug_MultiRegion.region1]",
"Clients: (1/1) [" + nodeIdClient1 + "]",
"Created debug archive",
},
},
{
name: "region2 - server2 address",
Fix flaky `operator debug` test (#12501) We introduced a `pprof-interval` argument to `operator debug` in #11938, and unfortunately this has resulted in a lot of test flakes. The actual command in use is mostly fine (although I've fixed some quirks here), so what's really happened is that the change has revealed some existing issues in the tests. Summary of changes: * Make first pprof collection synchronous to preserve the existing behavior for the common case where the pprof interval matches the duration. * Clamp `operator debug` pprof timing to that of the command. The `pprof-duration` should be no more than `duration` and the `pprof-interval` should be no more than `pprof-duration`. Clamp the values rather than throwing errors, which could change the commands that existing users might already have in debugging scripts * Testing: remove test parallelism The `operator debug` tests that stand up servers can't be run in parallel, because we don't have a way of canceling the API calls for pprof. The agent will still be running the last pprof when we exit, and that breaks the next test that talks to that same agent. (Because you can only run one pprof at a time on any process!) We could split off each subtest into its own server, but this test suite is already very slow. In future work we should fix this "for real" by making the API call cancelable. * Testing: assert against unexpected errors in `operator debug` tests. If we assert there are no unexpected error outputs, it's easier for the developer to debug when something is going wrong with the tests because the error output will be presented as a failing test, rather than just a failing exit code check. Or worse, no failing exit code check! This also forces us to be explicit about which tests will return 0 exit codes but still emit (presumably ignorable) error outputs. Additional minor bug fixes (mostly in tests) and test refactorings: * Fix text alignment on pprof Duration in `operator debug` output * Remove "done" channel from `operator debug` event stream test. The goroutine we're blocking for here already tells us it's done by sending a value, so block on that instead of an extraneous channel * Event stream test timer should start at current time, not zero * Remove noise from `operator debug` test log output. The `t.Logf` calls already are picked out from the rest of the test output by being prefixed with the filename. * Remove explicit pprof args so we use the defaults clamped from duration/interval
2022-04-07 19:00:07 +00:00
args: []string{"-address", addrServer2, "-region", region2, "-duration", "50ms", "-interval", "50ms", "-server-id", "all", "-node-id", "all"},
expectedCode: 0,
expectedOutputs: []string{
"Region: " + region2 + "\n",
"Servers: (1/1) [TestDebug_MultiRegion.region2]",
"Clients: (1/1) [" + nodeIdClient2 + "]",
"Created debug archive",
},
},
{
name: "region2 - client2 address",
Fix flaky `operator debug` test (#12501) We introduced a `pprof-interval` argument to `operator debug` in #11938, and unfortunately this has resulted in a lot of test flakes. The actual command in use is mostly fine (although I've fixed some quirks here), so what's really happened is that the change has revealed some existing issues in the tests. Summary of changes: * Make first pprof collection synchronous to preserve the existing behavior for the common case where the pprof interval matches the duration. * Clamp `operator debug` pprof timing to that of the command. The `pprof-duration` should be no more than `duration` and the `pprof-interval` should be no more than `pprof-duration`. Clamp the values rather than throwing errors, which could change the commands that existing users might already have in debugging scripts * Testing: remove test parallelism The `operator debug` tests that stand up servers can't be run in parallel, because we don't have a way of canceling the API calls for pprof. The agent will still be running the last pprof when we exit, and that breaks the next test that talks to that same agent. (Because you can only run one pprof at a time on any process!) We could split off each subtest into its own server, but this test suite is already very slow. In future work we should fix this "for real" by making the API call cancelable. * Testing: assert against unexpected errors in `operator debug` tests. If we assert there are no unexpected error outputs, it's easier for the developer to debug when something is going wrong with the tests because the error output will be presented as a failing test, rather than just a failing exit code check. Or worse, no failing exit code check! This also forces us to be explicit about which tests will return 0 exit codes but still emit (presumably ignorable) error outputs. Additional minor bug fixes (mostly in tests) and test refactorings: * Fix text alignment on pprof Duration in `operator debug` output * Remove "done" channel from `operator debug` event stream test. The goroutine we're blocking for here already tells us it's done by sending a value, so block on that instead of an extraneous channel * Event stream test timer should start at current time, not zero * Remove noise from `operator debug` test log output. The `t.Logf` calls already are picked out from the rest of the test output by being prefixed with the filename. * Remove explicit pprof args so we use the defaults clamped from duration/interval
2022-04-07 19:00:07 +00:00
args: []string{"-address", addrClient2, "-region", region2, "-duration", "50ms", "-interval", "50ms", "-server-id", "all", "-node-id", "all"},
expectedCode: 0,
expectedOutputs: []string{
"Region: " + region2 + "\n",
"Servers: (1/1) [TestDebug_MultiRegion.region2]",
"Clients: (1/1) [" + nodeIdClient2 + "]",
"Created debug archive",
},
},
// Bad
{
name: "invalid region - all servers, all clients",
Fix flaky `operator debug` test (#12501) We introduced a `pprof-interval` argument to `operator debug` in #11938, and unfortunately this has resulted in a lot of test flakes. The actual command in use is mostly fine (although I've fixed some quirks here), so what's really happened is that the change has revealed some existing issues in the tests. Summary of changes: * Make first pprof collection synchronous to preserve the existing behavior for the common case where the pprof interval matches the duration. * Clamp `operator debug` pprof timing to that of the command. The `pprof-duration` should be no more than `duration` and the `pprof-interval` should be no more than `pprof-duration`. Clamp the values rather than throwing errors, which could change the commands that existing users might already have in debugging scripts * Testing: remove test parallelism The `operator debug` tests that stand up servers can't be run in parallel, because we don't have a way of canceling the API calls for pprof. The agent will still be running the last pprof when we exit, and that breaks the next test that talks to that same agent. (Because you can only run one pprof at a time on any process!) We could split off each subtest into its own server, but this test suite is already very slow. In future work we should fix this "for real" by making the API call cancelable. * Testing: assert against unexpected errors in `operator debug` tests. If we assert there are no unexpected error outputs, it's easier for the developer to debug when something is going wrong with the tests because the error output will be presented as a failing test, rather than just a failing exit code check. Or worse, no failing exit code check! This also forces us to be explicit about which tests will return 0 exit codes but still emit (presumably ignorable) error outputs. Additional minor bug fixes (mostly in tests) and test refactorings: * Fix text alignment on pprof Duration in `operator debug` output * Remove "done" channel from `operator debug` event stream test. The goroutine we're blocking for here already tells us it's done by sending a value, so block on that instead of an extraneous channel * Event stream test timer should start at current time, not zero * Remove noise from `operator debug` test log output. The `t.Logf` calls already are picked out from the rest of the test output by being prefixed with the filename. * Remove explicit pprof args so we use the defaults clamped from duration/interval
2022-04-07 19:00:07 +00:00
args: []string{"-address", addrServer1, "-region", "never", "-duration", "50ms", "-interval", "50ms", "-server-id", "all", "-node-id", "all"},
expectedCode: 1,
expectedError: "500 (No path to region)",
},
}
runTestCases(t, cases)
}
func TestDebug_SingleServer(t *testing.T) {
srv, _, url := testServer(t, false, nil)
testutil.WaitForLeader(t, srv.Agent.RPC)
var cases = testCases{
{
name: "address=api, server-id=leader",
args: []string{"-address", url, "-duration", "250ms", "-interval", "250ms", "-server-id", "leader"},
expectedCode: 0,
expectedOutputs: []string{
"Servers: (1/1)",
"Clients: (0/0)",
"Created debug archive",
},
Fix flaky `operator debug` test (#12501) We introduced a `pprof-interval` argument to `operator debug` in #11938, and unfortunately this has resulted in a lot of test flakes. The actual command in use is mostly fine (although I've fixed some quirks here), so what's really happened is that the change has revealed some existing issues in the tests. Summary of changes: * Make first pprof collection synchronous to preserve the existing behavior for the common case where the pprof interval matches the duration. * Clamp `operator debug` pprof timing to that of the command. The `pprof-duration` should be no more than `duration` and the `pprof-interval` should be no more than `pprof-duration`. Clamp the values rather than throwing errors, which could change the commands that existing users might already have in debugging scripts * Testing: remove test parallelism The `operator debug` tests that stand up servers can't be run in parallel, because we don't have a way of canceling the API calls for pprof. The agent will still be running the last pprof when we exit, and that breaks the next test that talks to that same agent. (Because you can only run one pprof at a time on any process!) We could split off each subtest into its own server, but this test suite is already very slow. In future work we should fix this "for real" by making the API call cancelable. * Testing: assert against unexpected errors in `operator debug` tests. If we assert there are no unexpected error outputs, it's easier for the developer to debug when something is going wrong with the tests because the error output will be presented as a failing test, rather than just a failing exit code check. Or worse, no failing exit code check! This also forces us to be explicit about which tests will return 0 exit codes but still emit (presumably ignorable) error outputs. Additional minor bug fixes (mostly in tests) and test refactorings: * Fix text alignment on pprof Duration in `operator debug` output * Remove "done" channel from `operator debug` event stream test. The goroutine we're blocking for here already tells us it's done by sending a value, so block on that instead of an extraneous channel * Event stream test timer should start at current time, not zero * Remove noise from `operator debug` test log output. The `t.Logf` calls already are picked out from the rest of the test output by being prefixed with the filename. * Remove explicit pprof args so we use the defaults clamped from duration/interval
2022-04-07 19:00:07 +00:00
expectedError: "No node(s) with prefix",
},
{
name: "address=api, server-id=all",
args: []string{"-address", url, "-duration", "250ms", "-interval", "250ms", "-server-id", "all"},
expectedCode: 0,
expectedOutputs: []string{
"Servers: (1/1)",
"Clients: (0/0)",
"Created debug archive",
},
Fix flaky `operator debug` test (#12501) We introduced a `pprof-interval` argument to `operator debug` in #11938, and unfortunately this has resulted in a lot of test flakes. The actual command in use is mostly fine (although I've fixed some quirks here), so what's really happened is that the change has revealed some existing issues in the tests. Summary of changes: * Make first pprof collection synchronous to preserve the existing behavior for the common case where the pprof interval matches the duration. * Clamp `operator debug` pprof timing to that of the command. The `pprof-duration` should be no more than `duration` and the `pprof-interval` should be no more than `pprof-duration`. Clamp the values rather than throwing errors, which could change the commands that existing users might already have in debugging scripts * Testing: remove test parallelism The `operator debug` tests that stand up servers can't be run in parallel, because we don't have a way of canceling the API calls for pprof. The agent will still be running the last pprof when we exit, and that breaks the next test that talks to that same agent. (Because you can only run one pprof at a time on any process!) We could split off each subtest into its own server, but this test suite is already very slow. In future work we should fix this "for real" by making the API call cancelable. * Testing: assert against unexpected errors in `operator debug` tests. If we assert there are no unexpected error outputs, it's easier for the developer to debug when something is going wrong with the tests because the error output will be presented as a failing test, rather than just a failing exit code check. Or worse, no failing exit code check! This also forces us to be explicit about which tests will return 0 exit codes but still emit (presumably ignorable) error outputs. Additional minor bug fixes (mostly in tests) and test refactorings: * Fix text alignment on pprof Duration in `operator debug` output * Remove "done" channel from `operator debug` event stream test. The goroutine we're blocking for here already tells us it's done by sending a value, so block on that instead of an extraneous channel * Event stream test timer should start at current time, not zero * Remove noise from `operator debug` test log output. The `t.Logf` calls already are picked out from the rest of the test output by being prefixed with the filename. * Remove explicit pprof args so we use the defaults clamped from duration/interval
2022-04-07 19:00:07 +00:00
expectedError: "No node(s) with prefix",
},
}
runTestCases(t, cases)
}
func TestDebug_Failures(t *testing.T) {
srv, _, url := testServer(t, false, nil)
testutil.WaitForLeader(t, srv.Agent.RPC)
var cases = testCases{
{
Fix flaky `operator debug` test (#12501) We introduced a `pprof-interval` argument to `operator debug` in #11938, and unfortunately this has resulted in a lot of test flakes. The actual command in use is mostly fine (although I've fixed some quirks here), so what's really happened is that the change has revealed some existing issues in the tests. Summary of changes: * Make first pprof collection synchronous to preserve the existing behavior for the common case where the pprof interval matches the duration. * Clamp `operator debug` pprof timing to that of the command. The `pprof-duration` should be no more than `duration` and the `pprof-interval` should be no more than `pprof-duration`. Clamp the values rather than throwing errors, which could change the commands that existing users might already have in debugging scripts * Testing: remove test parallelism The `operator debug` tests that stand up servers can't be run in parallel, because we don't have a way of canceling the API calls for pprof. The agent will still be running the last pprof when we exit, and that breaks the next test that talks to that same agent. (Because you can only run one pprof at a time on any process!) We could split off each subtest into its own server, but this test suite is already very slow. In future work we should fix this "for real" by making the API call cancelable. * Testing: assert against unexpected errors in `operator debug` tests. If we assert there are no unexpected error outputs, it's easier for the developer to debug when something is going wrong with the tests because the error output will be presented as a failing test, rather than just a failing exit code check. Or worse, no failing exit code check! This also forces us to be explicit about which tests will return 0 exit codes but still emit (presumably ignorable) error outputs. Additional minor bug fixes (mostly in tests) and test refactorings: * Fix text alignment on pprof Duration in `operator debug` output * Remove "done" channel from `operator debug` event stream test. The goroutine we're blocking for here already tells us it's done by sending a value, so block on that instead of an extraneous channel * Event stream test timer should start at current time, not zero * Remove noise from `operator debug` test log output. The `t.Logf` calls already are picked out from the rest of the test output by being prefixed with the filename. * Remove explicit pprof args so we use the defaults clamped from duration/interval
2022-04-07 19:00:07 +00:00
name: "fails incorrect args",
args: []string{"some", "bad", "args"},
expectedCode: 1,
expectedError: "This command takes no arguments",
},
{
Fix flaky `operator debug` test (#12501) We introduced a `pprof-interval` argument to `operator debug` in #11938, and unfortunately this has resulted in a lot of test flakes. The actual command in use is mostly fine (although I've fixed some quirks here), so what's really happened is that the change has revealed some existing issues in the tests. Summary of changes: * Make first pprof collection synchronous to preserve the existing behavior for the common case where the pprof interval matches the duration. * Clamp `operator debug` pprof timing to that of the command. The `pprof-duration` should be no more than `duration` and the `pprof-interval` should be no more than `pprof-duration`. Clamp the values rather than throwing errors, which could change the commands that existing users might already have in debugging scripts * Testing: remove test parallelism The `operator debug` tests that stand up servers can't be run in parallel, because we don't have a way of canceling the API calls for pprof. The agent will still be running the last pprof when we exit, and that breaks the next test that talks to that same agent. (Because you can only run one pprof at a time on any process!) We could split off each subtest into its own server, but this test suite is already very slow. In future work we should fix this "for real" by making the API call cancelable. * Testing: assert against unexpected errors in `operator debug` tests. If we assert there are no unexpected error outputs, it's easier for the developer to debug when something is going wrong with the tests because the error output will be presented as a failing test, rather than just a failing exit code check. Or worse, no failing exit code check! This also forces us to be explicit about which tests will return 0 exit codes but still emit (presumably ignorable) error outputs. Additional minor bug fixes (mostly in tests) and test refactorings: * Fix text alignment on pprof Duration in `operator debug` output * Remove "done" channel from `operator debug` event stream test. The goroutine we're blocking for here already tells us it's done by sending a value, so block on that instead of an extraneous channel * Event stream test timer should start at current time, not zero * Remove noise from `operator debug` test log output. The `t.Logf` calls already are picked out from the rest of the test output by being prefixed with the filename. * Remove explicit pprof args so we use the defaults clamped from duration/interval
2022-04-07 19:00:07 +00:00
name: "Fails illegal node ids",
args: []string{"-node-id", "foo:bar"},
expectedCode: 1,
expectedError: "Error querying node info",
},
{
Fix flaky `operator debug` test (#12501) We introduced a `pprof-interval` argument to `operator debug` in #11938, and unfortunately this has resulted in a lot of test flakes. The actual command in use is mostly fine (although I've fixed some quirks here), so what's really happened is that the change has revealed some existing issues in the tests. Summary of changes: * Make first pprof collection synchronous to preserve the existing behavior for the common case where the pprof interval matches the duration. * Clamp `operator debug` pprof timing to that of the command. The `pprof-duration` should be no more than `duration` and the `pprof-interval` should be no more than `pprof-duration`. Clamp the values rather than throwing errors, which could change the commands that existing users might already have in debugging scripts * Testing: remove test parallelism The `operator debug` tests that stand up servers can't be run in parallel, because we don't have a way of canceling the API calls for pprof. The agent will still be running the last pprof when we exit, and that breaks the next test that talks to that same agent. (Because you can only run one pprof at a time on any process!) We could split off each subtest into its own server, but this test suite is already very slow. In future work we should fix this "for real" by making the API call cancelable. * Testing: assert against unexpected errors in `operator debug` tests. If we assert there are no unexpected error outputs, it's easier for the developer to debug when something is going wrong with the tests because the error output will be presented as a failing test, rather than just a failing exit code check. Or worse, no failing exit code check! This also forces us to be explicit about which tests will return 0 exit codes but still emit (presumably ignorable) error outputs. Additional minor bug fixes (mostly in tests) and test refactorings: * Fix text alignment on pprof Duration in `operator debug` output * Remove "done" channel from `operator debug` event stream test. The goroutine we're blocking for here already tells us it's done by sending a value, so block on that instead of an extraneous channel * Event stream test timer should start at current time, not zero * Remove noise from `operator debug` test log output. The `t.Logf` calls already are picked out from the rest of the test output by being prefixed with the filename. * Remove explicit pprof args so we use the defaults clamped from duration/interval
2022-04-07 19:00:07 +00:00
name: "Fails missing node ids",
args: []string{"-node-id", "abc,def", "-duration", "250ms", "-interval", "250ms"},
expectedCode: 1,
expectedError: "Error querying node info",
},
{
Fix flaky `operator debug` test (#12501) We introduced a `pprof-interval` argument to `operator debug` in #11938, and unfortunately this has resulted in a lot of test flakes. The actual command in use is mostly fine (although I've fixed some quirks here), so what's really happened is that the change has revealed some existing issues in the tests. Summary of changes: * Make first pprof collection synchronous to preserve the existing behavior for the common case where the pprof interval matches the duration. * Clamp `operator debug` pprof timing to that of the command. The `pprof-duration` should be no more than `duration` and the `pprof-interval` should be no more than `pprof-duration`. Clamp the values rather than throwing errors, which could change the commands that existing users might already have in debugging scripts * Testing: remove test parallelism The `operator debug` tests that stand up servers can't be run in parallel, because we don't have a way of canceling the API calls for pprof. The agent will still be running the last pprof when we exit, and that breaks the next test that talks to that same agent. (Because you can only run one pprof at a time on any process!) We could split off each subtest into its own server, but this test suite is already very slow. In future work we should fix this "for real" by making the API call cancelable. * Testing: assert against unexpected errors in `operator debug` tests. If we assert there are no unexpected error outputs, it's easier for the developer to debug when something is going wrong with the tests because the error output will be presented as a failing test, rather than just a failing exit code check. Or worse, no failing exit code check! This also forces us to be explicit about which tests will return 0 exit codes but still emit (presumably ignorable) error outputs. Additional minor bug fixes (mostly in tests) and test refactorings: * Fix text alignment on pprof Duration in `operator debug` output * Remove "done" channel from `operator debug` event stream test. The goroutine we're blocking for here already tells us it's done by sending a value, so block on that instead of an extraneous channel * Event stream test timer should start at current time, not zero * Remove noise from `operator debug` test log output. The `t.Logf` calls already are picked out from the rest of the test output by being prefixed with the filename. * Remove explicit pprof args so we use the defaults clamped from duration/interval
2022-04-07 19:00:07 +00:00
name: "Fails bad durations",
args: []string{"-duration", "foo"},
expectedCode: 1,
expectedError: "Error parsing duration: foo: time: invalid duration \"foo\""},
{
Fix flaky `operator debug` test (#12501) We introduced a `pprof-interval` argument to `operator debug` in #11938, and unfortunately this has resulted in a lot of test flakes. The actual command in use is mostly fine (although I've fixed some quirks here), so what's really happened is that the change has revealed some existing issues in the tests. Summary of changes: * Make first pprof collection synchronous to preserve the existing behavior for the common case where the pprof interval matches the duration. * Clamp `operator debug` pprof timing to that of the command. The `pprof-duration` should be no more than `duration` and the `pprof-interval` should be no more than `pprof-duration`. Clamp the values rather than throwing errors, which could change the commands that existing users might already have in debugging scripts * Testing: remove test parallelism The `operator debug` tests that stand up servers can't be run in parallel, because we don't have a way of canceling the API calls for pprof. The agent will still be running the last pprof when we exit, and that breaks the next test that talks to that same agent. (Because you can only run one pprof at a time on any process!) We could split off each subtest into its own server, but this test suite is already very slow. In future work we should fix this "for real" by making the API call cancelable. * Testing: assert against unexpected errors in `operator debug` tests. If we assert there are no unexpected error outputs, it's easier for the developer to debug when something is going wrong with the tests because the error output will be presented as a failing test, rather than just a failing exit code check. Or worse, no failing exit code check! This also forces us to be explicit about which tests will return 0 exit codes but still emit (presumably ignorable) error outputs. Additional minor bug fixes (mostly in tests) and test refactorings: * Fix text alignment on pprof Duration in `operator debug` output * Remove "done" channel from `operator debug` event stream test. The goroutine we're blocking for here already tells us it's done by sending a value, so block on that instead of an extraneous channel * Event stream test timer should start at current time, not zero * Remove noise from `operator debug` test log output. The `t.Logf` calls already are picked out from the rest of the test output by being prefixed with the filename. * Remove explicit pprof args so we use the defaults clamped from duration/interval
2022-04-07 19:00:07 +00:00
name: "Fails bad intervals",
args: []string{"-interval", "bar"},
expectedCode: 1,
expectedError: "Error parsing interval: bar: time: invalid duration \"bar\"",
},
{
Fix flaky `operator debug` test (#12501) We introduced a `pprof-interval` argument to `operator debug` in #11938, and unfortunately this has resulted in a lot of test flakes. The actual command in use is mostly fine (although I've fixed some quirks here), so what's really happened is that the change has revealed some existing issues in the tests. Summary of changes: * Make first pprof collection synchronous to preserve the existing behavior for the common case where the pprof interval matches the duration. * Clamp `operator debug` pprof timing to that of the command. The `pprof-duration` should be no more than `duration` and the `pprof-interval` should be no more than `pprof-duration`. Clamp the values rather than throwing errors, which could change the commands that existing users might already have in debugging scripts * Testing: remove test parallelism The `operator debug` tests that stand up servers can't be run in parallel, because we don't have a way of canceling the API calls for pprof. The agent will still be running the last pprof when we exit, and that breaks the next test that talks to that same agent. (Because you can only run one pprof at a time on any process!) We could split off each subtest into its own server, but this test suite is already very slow. In future work we should fix this "for real" by making the API call cancelable. * Testing: assert against unexpected errors in `operator debug` tests. If we assert there are no unexpected error outputs, it's easier for the developer to debug when something is going wrong with the tests because the error output will be presented as a failing test, rather than just a failing exit code check. Or worse, no failing exit code check! This also forces us to be explicit about which tests will return 0 exit codes but still emit (presumably ignorable) error outputs. Additional minor bug fixes (mostly in tests) and test refactorings: * Fix text alignment on pprof Duration in `operator debug` output * Remove "done" channel from `operator debug` event stream test. The goroutine we're blocking for here already tells us it's done by sending a value, so block on that instead of an extraneous channel * Event stream test timer should start at current time, not zero * Remove noise from `operator debug` test log output. The `t.Logf` calls already are picked out from the rest of the test output by being prefixed with the filename. * Remove explicit pprof args so we use the defaults clamped from duration/interval
2022-04-07 19:00:07 +00:00
name: "Fails intervals greater than duration",
args: []string{"-duration", "5m", "-interval", "10m"},
expectedCode: 1,
expectedError: "Error parsing interval: 10m is greater than duration 5m",
},
{
Fix flaky `operator debug` test (#12501) We introduced a `pprof-interval` argument to `operator debug` in #11938, and unfortunately this has resulted in a lot of test flakes. The actual command in use is mostly fine (although I've fixed some quirks here), so what's really happened is that the change has revealed some existing issues in the tests. Summary of changes: * Make first pprof collection synchronous to preserve the existing behavior for the common case where the pprof interval matches the duration. * Clamp `operator debug` pprof timing to that of the command. The `pprof-duration` should be no more than `duration` and the `pprof-interval` should be no more than `pprof-duration`. Clamp the values rather than throwing errors, which could change the commands that existing users might already have in debugging scripts * Testing: remove test parallelism The `operator debug` tests that stand up servers can't be run in parallel, because we don't have a way of canceling the API calls for pprof. The agent will still be running the last pprof when we exit, and that breaks the next test that talks to that same agent. (Because you can only run one pprof at a time on any process!) We could split off each subtest into its own server, but this test suite is already very slow. In future work we should fix this "for real" by making the API call cancelable. * Testing: assert against unexpected errors in `operator debug` tests. If we assert there are no unexpected error outputs, it's easier for the developer to debug when something is going wrong with the tests because the error output will be presented as a failing test, rather than just a failing exit code check. Or worse, no failing exit code check! This also forces us to be explicit about which tests will return 0 exit codes but still emit (presumably ignorable) error outputs. Additional minor bug fixes (mostly in tests) and test refactorings: * Fix text alignment on pprof Duration in `operator debug` output * Remove "done" channel from `operator debug` event stream test. The goroutine we're blocking for here already tells us it's done by sending a value, so block on that instead of an extraneous channel * Event stream test timer should start at current time, not zero * Remove noise from `operator debug` test log output. The `t.Logf` calls already are picked out from the rest of the test output by being prefixed with the filename. * Remove explicit pprof args so we use the defaults clamped from duration/interval
2022-04-07 19:00:07 +00:00
name: "Fails bad pprof duration",
args: []string{"-pprof-duration", "baz"},
expectedCode: 1,
expectedError: "Error parsing pprof duration: baz: time: invalid duration \"baz\"",
},
{
Fix flaky `operator debug` test (#12501) We introduced a `pprof-interval` argument to `operator debug` in #11938, and unfortunately this has resulted in a lot of test flakes. The actual command in use is mostly fine (although I've fixed some quirks here), so what's really happened is that the change has revealed some existing issues in the tests. Summary of changes: * Make first pprof collection synchronous to preserve the existing behavior for the common case where the pprof interval matches the duration. * Clamp `operator debug` pprof timing to that of the command. The `pprof-duration` should be no more than `duration` and the `pprof-interval` should be no more than `pprof-duration`. Clamp the values rather than throwing errors, which could change the commands that existing users might already have in debugging scripts * Testing: remove test parallelism The `operator debug` tests that stand up servers can't be run in parallel, because we don't have a way of canceling the API calls for pprof. The agent will still be running the last pprof when we exit, and that breaks the next test that talks to that same agent. (Because you can only run one pprof at a time on any process!) We could split off each subtest into its own server, but this test suite is already very slow. In future work we should fix this "for real" by making the API call cancelable. * Testing: assert against unexpected errors in `operator debug` tests. If we assert there are no unexpected error outputs, it's easier for the developer to debug when something is going wrong with the tests because the error output will be presented as a failing test, rather than just a failing exit code check. Or worse, no failing exit code check! This also forces us to be explicit about which tests will return 0 exit codes but still emit (presumably ignorable) error outputs. Additional minor bug fixes (mostly in tests) and test refactorings: * Fix text alignment on pprof Duration in `operator debug` output * Remove "done" channel from `operator debug` event stream test. The goroutine we're blocking for here already tells us it's done by sending a value, so block on that instead of an extraneous channel * Event stream test timer should start at current time, not zero * Remove noise from `operator debug` test log output. The `t.Logf` calls already are picked out from the rest of the test output by being prefixed with the filename. * Remove explicit pprof args so we use the defaults clamped from duration/interval
2022-04-07 19:00:07 +00:00
name: "Fails bad pprof interval",
args: []string{"-pprof-interval", "bar"},
expectedCode: 1,
expectedError: "Error parsing pprof-interval: bar: time: invalid duration \"bar\"",
},
{
name: "Fails bad address",
args: []string{"-address", url + "bogus"},
expectedCode: 1,
expectedError: "invalid address",
},
}
runTestCases(t, cases)
}
func TestDebug_Bad_CSIPlugin_Names(t *testing.T) {
// Start test server and API client
srv, _, url := testServer(t, false, nil)
// Wait for leadership to establish
testutil.WaitForLeader(t, srv.Agent.RPC)
cases := []string{
"aws/ebs",
"gcp-*-1",
}
for _, pluginName := range cases {
cleanup := state.CreateTestCSIPlugin(srv.Agent.Server().State(), pluginName)
defer cleanup()
}
// Setup mock UI
2020-10-06 13:47:16 +00:00
ui := cli.NewMockUi()
cmd := &OperatorDebugCommand{Meta: Meta{Ui: ui}}
testDir := t.TempDir()
defer os.Remove(testDir)
// Debug on the leader and all client nodes
code := cmd.Run([]string{"-address", url, "-duration", "250ms", "-interval", "250ms", "-server-id", "leader", "-node-id", "all", "-output", testDir})
assert.Equal(t, 0, code)
// Bad plugin name should be escaped before it reaches the sandbox test
require.NotContains(t, ui.ErrorWriter.String(), "file path escapes capture directory")
require.Contains(t, ui.OutputWriter.String(), "Starting debugger")
path := cmd.collectDir
var pluginFiles []string
for _, pluginName := range cases {
pluginFile := fmt.Sprintf("csi-plugin-id-%s.json", helper.CleanFilename(pluginName, "_"))
pluginFile = filepath.Join(path, intervalDir, "0000", pluginFile)
pluginFiles = append(pluginFiles, pluginFile)
}
testutil.WaitForFiles(t, pluginFiles)
}
func buildPathSlice(path string, files []string) []string {
paths := []string{}
for _, file := range files {
paths = append(paths, filepath.Join(path, file))
}
return paths
}
func TestDebug_CapturedFiles(t *testing.T) {
srv, _, url := testServer(t, true, nil)
testutil.WaitForLeader(t, srv.Agent.RPC)
serverNodeName := srv.Config.NodeName
region := srv.Config.Region
serverName := fmt.Sprintf("%s.%s", serverNodeName, region)
clientID := srv.Agent.Client().NodeID()
testutil.WaitForClient(t, srv.Agent.Client().RPC, clientID, srv.Agent.Client().Region())
t.Logf("serverName: %s, clientID, %s", serverName, clientID)
// Setup file slices
clusterFiles := []string{
"agent-self.json",
"members.json",
"namespaces.json",
"regions.json",
}
pprofFiles := []string{
"allocs.prof",
"goroutine-debug1.txt",
"goroutine-debug2.txt",
"goroutine.prof",
"heap.prof",
"profile_0000.prof",
"threadcreate.prof",
"trace.prof",
}
clientFiles := []string{
"agent-host.json",
"monitor.log",
}
clientFiles = append(clientFiles, pprofFiles...)
serverFiles := []string{
"agent-host.json",
"monitor.log",
}
serverFiles = append(serverFiles, pprofFiles...)
intervalFiles := []string{
"allocations.json",
"csi-plugins.json",
"csi-volumes.json",
"deployments.json",
"evaluations.json",
"jobs.json",
"license.json",
"metrics.json",
"nodes.json",
"operator-autopilot-health.json",
"operator-raft.json",
"operator-scheduler.json",
}
2020-10-06 13:47:16 +00:00
ui := cli.NewMockUi()
cmd := &OperatorDebugCommand{Meta: Meta{Ui: ui}}
testDir := t.TempDir()
defer os.Remove(testDir)
duration := 2 * time.Second
interval := 750 * time.Millisecond
waitTime := 2 * duration
code := cmd.Run([]string{
"-address", url,
"-output", testDir,
"-server-id", serverName,
"-node-id", clientID,
"-duration", duration.String(),
"-interval", interval.String(),
})
// There should be no errors
require.Empty(t, ui.ErrorWriter.String())
require.Equal(t, 0, code)
ui.ErrorWriter.Reset()
// Verify cluster files
clusterPaths := buildPathSlice(cmd.path(clusterDir), clusterFiles)
t.Logf("Waiting for cluster files in path: %s", clusterDir)
testutil.WaitForFilesUntil(t, clusterPaths, waitTime)
// Verify client files
clientPaths := buildPathSlice(cmd.path(clientDir, clientID), clientFiles)
t.Logf("Waiting for client files in path: %s", clientDir)
testutil.WaitForFilesUntil(t, clientPaths, waitTime)
// Verify server files
serverPaths := buildPathSlice(cmd.path(serverDir, serverName), serverFiles)
t.Logf("Waiting for server files in path: %s", serverDir)
testutil.WaitForFilesUntil(t, serverPaths, waitTime)
// Verify interval 0000 files
intervalPaths0 := buildPathSlice(cmd.path(intervalDir, "0000"), intervalFiles)
t.Logf("Waiting for interval 0000 files in path: %s", intervalDir)
testutil.WaitForFilesUntil(t, intervalPaths0, waitTime)
// Verify interval 0001 files
intervalPaths1 := buildPathSlice(cmd.path(intervalDir, "0001"), intervalFiles)
t.Logf("Waiting for interval 0001 files in path: %s", intervalDir)
testutil.WaitForFilesUntil(t, intervalPaths1, waitTime)
}
func TestDebug_ExistingOutput(t *testing.T) {
ci.Parallel(t)
ui := cli.NewMockUi()
cmd := &OperatorDebugCommand{Meta: Meta{Ui: ui}}
// Fails existing output
format := "2006-01-02-150405Z"
stamped := "nomad-debug-" + time.Now().UTC().Format(format)
tempDir := t.TempDir()
path := filepath.Join(tempDir, stamped)
os.MkdirAll(path, 0755)
defer os.Remove(tempDir)
code := cmd.Run([]string{"-output", tempDir, "-duration", "50ms", "-interval", "50ms"})
require.Equal(t, 2, code)
}
func TestDebug_Fail_Pprof(t *testing.T) {
// Setup agent config with debug endpoints disabled
agentConfFunc := func(c *agent.Config) {
c.EnableDebug = false
}
// Start test server and API client
srv, _, url := testServer(t, false, agentConfFunc)
// Wait for leadership to establish
testutil.WaitForLeader(t, srv.Agent.RPC)
// Setup mock UI
ui := cli.NewMockUi()
cmd := &OperatorDebugCommand{Meta: Meta{Ui: ui}}
// Debug on server with endpoints disabled
code := cmd.Run([]string{"-address", url, "-duration", "250ms", "-interval", "250ms", "-server-id", "all"})
assert.Equal(t, 0, code) // Pprof failure isn't fatal
require.Contains(t, ui.OutputWriter.String(), "Starting debugger")
require.Contains(t, ui.ErrorWriter.String(), "Failed to retrieve pprof") // Should report pprof failure
require.Contains(t, ui.ErrorWriter.String(), "Permission denied") // Specifically permission denied
require.Contains(t, ui.OutputWriter.String(), "Created debug archive") // Archive should be generated anyway
}
// TestDebug_PprofVersionCheck asserts that only versions < 0.12.0 are
// filtered by the version constraint.
func TestDebug_PprofVersionCheck(t *testing.T) {
cases := []struct {
version string
errMsg string
}{
{"0.8.7", ""},
{"0.11.1", "unsupported version=0.11.1 matches version filter >= 0.11.0, <= 0.11.2"},
{"0.11.2", "unsupported version=0.11.2 matches version filter >= 0.11.0, <= 0.11.2"},
{"0.11.2+ent", "unsupported version=0.11.2+ent matches version filter >= 0.11.0, <= 0.11.2"},
{"0.11.3", ""},
{"0.11.3+ent", ""},
{"0.12.0", ""},
{"1.3.0", ""},
{"foo.bar", "error: Malformed version: foo.bar"},
}
for _, tc := range cases {
t.Run(tc.version, func(t *testing.T) {
err := checkVersion(tc.version, minimumVersionPprofConstraint)
if tc.errMsg == "" {
require.NoError(t, err, "expected no error from %s", tc.version)
} else {
require.EqualError(t, err, tc.errMsg)
}
})
}
}
func TestDebug_StringToSlice(t *testing.T) {
ci.Parallel(t)
cases := []struct {
input string
expected []string
}{
{input: ",,", expected: []string(nil)},
{input: "", expected: []string(nil)},
{input: "foo, bar", expected: []string{"foo", "bar"}},
{input: " foo, bar ", expected: []string{"foo", "bar"}},
{input: "foo,,bar", expected: []string{"foo", "bar"}},
}
for _, tc := range cases {
out := stringToSlice(tc.input)
require.Equal(t, tc.expected, out)
}
}
func TestDebug_External(t *testing.T) {
ci.Parallel(t)
// address calculation honors CONSUL_HTTP_SSL
// ssl: true - Correct alignment
e := &external{addrVal: "https://127.0.0.1:8500", ssl: true}
addr := e.addr("foo")
require.Equal(t, "https://127.0.0.1:8500", addr)
// ssl: true - protocol incorrect
// NOTE: Address with protocol now overrides ssl flag
e = &external{addrVal: "http://127.0.0.1:8500", ssl: true}
addr = e.addr("foo")
require.Equal(t, "http://127.0.0.1:8500", addr)
// ssl: true - protocol missing
e = &external{addrVal: "127.0.0.1:8500", ssl: true}
addr = e.addr("foo")
require.Equal(t, "https://127.0.0.1:8500", addr)
// ssl: false - correct alignment
e = &external{addrVal: "http://127.0.0.1:8500", ssl: false}
addr = e.addr("foo")
require.Equal(t, "http://127.0.0.1:8500", addr)
// ssl: false - protocol incorrect
// NOTE: Address with protocol now overrides ssl flag
e = &external{addrVal: "https://127.0.0.1:8500", ssl: false}
addr = e.addr("foo")
require.Equal(t, "https://127.0.0.1:8500", addr)
// ssl: false - protocol missing
e = &external{addrVal: "127.0.0.1:8500", ssl: false}
addr = e.addr("foo")
require.Equal(t, "http://127.0.0.1:8500", addr)
// Address through proxy might not have a port
e = &external{addrVal: "https://127.0.0.1", ssl: true}
addr = e.addr("foo")
require.Equal(t, "https://127.0.0.1", addr)
}
func TestDebug_WriteBytes_Nil(t *testing.T) {
ci.Parallel(t)
var testDir, testFile, testPath string
var testBytes []byte
// Setup mock UI
ui := cli.NewMockUi()
cmd := &OperatorDebugCommand{Meta: Meta{Ui: ui}}
testDir = t.TempDir()
defer os.Remove(testDir)
cmd.collectDir = testDir
testFile = "test_nil.json"
testPath = filepath.Join(testDir, testFile)
// Write nil file at top level of collect directory
err := cmd.writeBytes("", testFile, testBytes)
require.NoError(t, err)
require.FileExists(t, testPath)
}
func TestDebug_WriteBytes_PathEscapesSandbox(t *testing.T) {
ci.Parallel(t)
var testDir, testFile string
var testBytes []byte
testDir = t.TempDir()
defer os.Remove(testDir)
testFile = "testing.json"
testPath := filepath.Join(testDir, testFile)
defer os.Remove(testPath)
// Setup mock UI
ui := cli.NewMockUi()
cmd := &OperatorDebugCommand{Meta: Meta{Ui: ui}}
// Empty collectDir will always appear to be escaped
cmd.collectDir = ""
err := cmd.writeBytes(testDir, testFile, testBytes)
require.Error(t, err)
}
func TestDebug_CollectConsul(t *testing.T) {
ci.Parallel(t)
if testing.Short() {
t.Skip("-short set; skipping")
}
// Skip test if Consul binary cannot be found
clienttest.RequireConsul(t)
// Create an embedded Consul server
testconsul, err := consultest.NewTestServerConfigT(t, func(c *consultest.TestServerConfig) {
c.Peering = nil // fix for older versions of Consul (<1.13.0) that don't support peering
// If -v wasn't specified squelch consul logging
if !testing.Verbose() {
c.Stdout = io.Discard
c.Stderr = io.Discard
}
})
require.NoError(t, err)
if err != nil {
t.Fatalf("error starting test consul server: %v", err)
}
defer testconsul.Stop()
consulConfig := consulapi.DefaultConfig()
consulConfig.Address = testconsul.HTTPAddr
// Setup mock UI
ui := cli.NewMockUi()
c := &OperatorDebugCommand{Meta: Meta{Ui: ui}}
// Setup Consul *external
ce := &external{}
ce.setAddr(consulConfig.Address)
if ce.ssl {
ce.tls = &api.TLSConfig{}
}
// Set global client
c.consul = ce
// Setup capture directory
testDir := t.TempDir()
defer os.Remove(testDir)
c.collectDir = testDir
// Collect data from Consul into folder "test"
c.collectConsul("test")
require.Empty(t, ui.ErrorWriter.String())
require.FileExists(t, filepath.Join(testDir, "test", "consul-agent-host.json"))
require.FileExists(t, filepath.Join(testDir, "test", "consul-agent-members.json"))
require.FileExists(t, filepath.Join(testDir, "test", "consul-agent-metrics.json"))
require.FileExists(t, filepath.Join(testDir, "test", "consul-leader.json"))
}
func TestDebug_CollectVault(t *testing.T) {
ci.Parallel(t)
if testing.Short() {
t.Skip("-short set; skipping")
}
// Skip test if Consul binary cannot be found
clienttest.RequireVault(t)
// Create a Vault server
v := testutil.NewTestVault(t)
defer v.Stop()
// Setup mock UI
ui := cli.NewMockUi()
c := &OperatorDebugCommand{Meta: Meta{Ui: ui}}
// Setup Vault *external
ve := &external{}
ve.tokenVal = v.RootToken
ve.setAddr(v.HTTPAddr)
if ve.ssl {
ve.tls = &api.TLSConfig{}
}
// Set global client
c.vault = ve
// Set capture directory
testDir := t.TempDir()
defer os.Remove(testDir)
c.collectDir = testDir
// Collect data from Vault
err := c.collectVault("test", "")
require.NoError(t, err)
require.Empty(t, ui.ErrorWriter.String())
require.FileExists(t, filepath.Join(testDir, "test", "vault-sys-health.json"))
}
// TestDebug_RedirectError asserts that redirect errors are detected so they
// can be translated into more understandable output.
func TestDebug_RedirectError(t *testing.T) {
ci.Parallel(t)
// Create a test server that always returns the error many versions of
// Nomad return instead of a 404 for unknown paths.
// 1st request redirects to /ui/
// 2nd request returns UI's HTML
ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
if strings.HasSuffix(r.URL.String(), "/ui/") {
fmt.Fprintln(w, `<html>Fake UI HTML</html>`)
return
}
w.Header().Set("Location", "/ui/")
w.WriteHeader(http.StatusTemporaryRedirect)
fmt.Fprintln(w, `<a href="/ui/">Temporary Redirect</a>.`)
}))
defer ts.Close()
config := api.DefaultConfig()
config.Address = ts.URL
client, err := api.NewClient(config)
require.NoError(t, err)
resp, err := client.Agent().Host("abc", "", nil)
assert.Nil(t, resp)
assert.True(t, isRedirectError(err), err.Error())
}
// TestDebug_StaleLeadership verifies that APIs that are required to
// complete a debug run have their query options configured with the
// -stale flag
func TestDebug_StaleLeadership(t *testing.T) {
srv, _, url := testServerWithoutLeader(t, false, nil)
addrServer := srv.HTTPAddr()
Fix flaky `operator debug` test (#12501) We introduced a `pprof-interval` argument to `operator debug` in #11938, and unfortunately this has resulted in a lot of test flakes. The actual command in use is mostly fine (although I've fixed some quirks here), so what's really happened is that the change has revealed some existing issues in the tests. Summary of changes: * Make first pprof collection synchronous to preserve the existing behavior for the common case where the pprof interval matches the duration. * Clamp `operator debug` pprof timing to that of the command. The `pprof-duration` should be no more than `duration` and the `pprof-interval` should be no more than `pprof-duration`. Clamp the values rather than throwing errors, which could change the commands that existing users might already have in debugging scripts * Testing: remove test parallelism The `operator debug` tests that stand up servers can't be run in parallel, because we don't have a way of canceling the API calls for pprof. The agent will still be running the last pprof when we exit, and that breaks the next test that talks to that same agent. (Because you can only run one pprof at a time on any process!) We could split off each subtest into its own server, but this test suite is already very slow. In future work we should fix this "for real" by making the API call cancelable. * Testing: assert against unexpected errors in `operator debug` tests. If we assert there are no unexpected error outputs, it's easier for the developer to debug when something is going wrong with the tests because the error output will be presented as a failing test, rather than just a failing exit code check. Or worse, no failing exit code check! This also forces us to be explicit about which tests will return 0 exit codes but still emit (presumably ignorable) error outputs. Additional minor bug fixes (mostly in tests) and test refactorings: * Fix text alignment on pprof Duration in `operator debug` output * Remove "done" channel from `operator debug` event stream test. The goroutine we're blocking for here already tells us it's done by sending a value, so block on that instead of an extraneous channel * Event stream test timer should start at current time, not zero * Remove noise from `operator debug` test log output. The `t.Logf` calls already are picked out from the rest of the test output by being prefixed with the filename. * Remove explicit pprof args so we use the defaults clamped from duration/interval
2022-04-07 19:00:07 +00:00
t.Logf("testAgent api address: %s", url)
t.Logf("Server api address: %s", addrServer)
var cases = testCases{
{
name: "no leader without stale flag",
args: []string{"-address", addrServer,
"-duration", "250ms", "-interval", "250ms",
"-server-id", "all", "-node-id", "all"},
Fix flaky `operator debug` test (#12501) We introduced a `pprof-interval` argument to `operator debug` in #11938, and unfortunately this has resulted in a lot of test flakes. The actual command in use is mostly fine (although I've fixed some quirks here), so what's really happened is that the change has revealed some existing issues in the tests. Summary of changes: * Make first pprof collection synchronous to preserve the existing behavior for the common case where the pprof interval matches the duration. * Clamp `operator debug` pprof timing to that of the command. The `pprof-duration` should be no more than `duration` and the `pprof-interval` should be no more than `pprof-duration`. Clamp the values rather than throwing errors, which could change the commands that existing users might already have in debugging scripts * Testing: remove test parallelism The `operator debug` tests that stand up servers can't be run in parallel, because we don't have a way of canceling the API calls for pprof. The agent will still be running the last pprof when we exit, and that breaks the next test that talks to that same agent. (Because you can only run one pprof at a time on any process!) We could split off each subtest into its own server, but this test suite is already very slow. In future work we should fix this "for real" by making the API call cancelable. * Testing: assert against unexpected errors in `operator debug` tests. If we assert there are no unexpected error outputs, it's easier for the developer to debug when something is going wrong with the tests because the error output will be presented as a failing test, rather than just a failing exit code check. Or worse, no failing exit code check! This also forces us to be explicit about which tests will return 0 exit codes but still emit (presumably ignorable) error outputs. Additional minor bug fixes (mostly in tests) and test refactorings: * Fix text alignment on pprof Duration in `operator debug` output * Remove "done" channel from `operator debug` event stream test. The goroutine we're blocking for here already tells us it's done by sending a value, so block on that instead of an extraneous channel * Event stream test timer should start at current time, not zero * Remove noise from `operator debug` test log output. The `t.Logf` calls already are picked out from the rest of the test output by being prefixed with the filename. * Remove explicit pprof args so we use the defaults clamped from duration/interval
2022-04-07 19:00:07 +00:00
expectedCode: 1,
expectedError: "No cluster leader",
},
{
name: "no leader with stale flag",
args: []string{
"-address", addrServer,
"-duration", "250ms", "-interval", "250ms",
"-server-id", "all", "-node-id", "all",
"-stale"},
expectedCode: 0,
expectedOutputs: []string{"Created debug archive"},
Fix flaky `operator debug` test (#12501) We introduced a `pprof-interval` argument to `operator debug` in #11938, and unfortunately this has resulted in a lot of test flakes. The actual command in use is mostly fine (although I've fixed some quirks here), so what's really happened is that the change has revealed some existing issues in the tests. Summary of changes: * Make first pprof collection synchronous to preserve the existing behavior for the common case where the pprof interval matches the duration. * Clamp `operator debug` pprof timing to that of the command. The `pprof-duration` should be no more than `duration` and the `pprof-interval` should be no more than `pprof-duration`. Clamp the values rather than throwing errors, which could change the commands that existing users might already have in debugging scripts * Testing: remove test parallelism The `operator debug` tests that stand up servers can't be run in parallel, because we don't have a way of canceling the API calls for pprof. The agent will still be running the last pprof when we exit, and that breaks the next test that talks to that same agent. (Because you can only run one pprof at a time on any process!) We could split off each subtest into its own server, but this test suite is already very slow. In future work we should fix this "for real" by making the API call cancelable. * Testing: assert against unexpected errors in `operator debug` tests. If we assert there are no unexpected error outputs, it's easier for the developer to debug when something is going wrong with the tests because the error output will be presented as a failing test, rather than just a failing exit code check. Or worse, no failing exit code check! This also forces us to be explicit about which tests will return 0 exit codes but still emit (presumably ignorable) error outputs. Additional minor bug fixes (mostly in tests) and test refactorings: * Fix text alignment on pprof Duration in `operator debug` output * Remove "done" channel from `operator debug` event stream test. The goroutine we're blocking for here already tells us it's done by sending a value, so block on that instead of an extraneous channel * Event stream test timer should start at current time, not zero * Remove noise from `operator debug` test log output. The `t.Logf` calls already are picked out from the rest of the test output by being prefixed with the filename. * Remove explicit pprof args so we use the defaults clamped from duration/interval
2022-04-07 19:00:07 +00:00
expectedError: "No node(s) with prefix", // still exits 0
},
}
runTestCases(t, cases)
}
func testServerWithoutLeader(t *testing.T, runClient bool, cb func(*agent.Config)) (*agent.TestAgent, *api.Client, string) {
// Make a new test server
a := agent.NewTestAgent(t, t.Name(), func(config *agent.Config) {
config.Client.Enabled = runClient
config.Server.Enabled = true
config.Server.NumSchedulers = pointer.Of(0)
config.Server.BootstrapExpect = 3
if cb != nil {
cb(config)
}
})
t.Cleanup(func() { a.Shutdown() })
c := a.Client()
return a, c, a.HTTPAddr()
}
// testOutput is used to receive test output from a channel
type testOutput struct {
name string
code int
output string
error string
}
func TestDebug_EventStream_TopicsFromString(t *testing.T) {
ci.Parallel(t)
cases := []struct {
name string
topicList string
want map[api.Topic][]string
}{
{
name: "topics = all",
topicList: "all",
want: allTopics(),
},
{
name: "topics = none",
topicList: "none",
want: nil,
},
{
name: "two topics",
topicList: "Deployment,Job",
want: map[api.Topic][]string{
"Deployment": {"*"},
"Job": {"*"},
},
},
{
name: "multiple topics and filters (using api const)",
topicList: "Evaluation:example,Job:*,Node:*",
want: map[api.Topic][]string{
api.TopicEvaluation: {"example"},
api.TopicJob: {"*"},
api.TopicNode: {"*"},
},
},
{
name: "capitalize topics",
topicList: "evaluation:example,job:*,node:*",
want: map[api.Topic][]string{
api.TopicEvaluation: {"example"},
api.TopicJob: {"*"},
api.TopicNode: {"*"},
},
},
{
name: "all topics for filterKey",
topicList: "*:example",
want: map[api.Topic][]string{
"*": {"example"},
},
},
}
for _, tc := range cases {
t.Run(tc.name, func(t *testing.T) {
got, err := topicsFromString(tc.topicList)
require.NoError(t, err)
require.Equal(t, tc.want, got)
})
}
}
func TestDebug_EventStream(t *testing.T) {
ci.Parallel(t)
// TODO dmay: specify output directory to allow inspection of eventstream.json
// TODO dmay: require specific events in the eventstream.json file(s)
// TODO dmay: scenario where no events are expected, verify "No events captured"
// TODO dmay: verify event topic filtering only includes expected events
Fix flaky `operator debug` test (#12501) We introduced a `pprof-interval` argument to `operator debug` in #11938, and unfortunately this has resulted in a lot of test flakes. The actual command in use is mostly fine (although I've fixed some quirks here), so what's really happened is that the change has revealed some existing issues in the tests. Summary of changes: * Make first pprof collection synchronous to preserve the existing behavior for the common case where the pprof interval matches the duration. * Clamp `operator debug` pprof timing to that of the command. The `pprof-duration` should be no more than `duration` and the `pprof-interval` should be no more than `pprof-duration`. Clamp the values rather than throwing errors, which could change the commands that existing users might already have in debugging scripts * Testing: remove test parallelism The `operator debug` tests that stand up servers can't be run in parallel, because we don't have a way of canceling the API calls for pprof. The agent will still be running the last pprof when we exit, and that breaks the next test that talks to that same agent. (Because you can only run one pprof at a time on any process!) We could split off each subtest into its own server, but this test suite is already very slow. In future work we should fix this "for real" by making the API call cancelable. * Testing: assert against unexpected errors in `operator debug` tests. If we assert there are no unexpected error outputs, it's easier for the developer to debug when something is going wrong with the tests because the error output will be presented as a failing test, rather than just a failing exit code check. Or worse, no failing exit code check! This also forces us to be explicit about which tests will return 0 exit codes but still emit (presumably ignorable) error outputs. Additional minor bug fixes (mostly in tests) and test refactorings: * Fix text alignment on pprof Duration in `operator debug` output * Remove "done" channel from `operator debug` event stream test. The goroutine we're blocking for here already tells us it's done by sending a value, so block on that instead of an extraneous channel * Event stream test timer should start at current time, not zero * Remove noise from `operator debug` test log output. The `t.Logf` calls already are picked out from the rest of the test output by being prefixed with the filename. * Remove explicit pprof args so we use the defaults clamped from duration/interval
2022-04-07 19:00:07 +00:00
start := time.Now()
// Start test server
srv, client, url := testServer(t, true, nil)
Fix flaky `operator debug` test (#12501) We introduced a `pprof-interval` argument to `operator debug` in #11938, and unfortunately this has resulted in a lot of test flakes. The actual command in use is mostly fine (although I've fixed some quirks here), so what's really happened is that the change has revealed some existing issues in the tests. Summary of changes: * Make first pprof collection synchronous to preserve the existing behavior for the common case where the pprof interval matches the duration. * Clamp `operator debug` pprof timing to that of the command. The `pprof-duration` should be no more than `duration` and the `pprof-interval` should be no more than `pprof-duration`. Clamp the values rather than throwing errors, which could change the commands that existing users might already have in debugging scripts * Testing: remove test parallelism The `operator debug` tests that stand up servers can't be run in parallel, because we don't have a way of canceling the API calls for pprof. The agent will still be running the last pprof when we exit, and that breaks the next test that talks to that same agent. (Because you can only run one pprof at a time on any process!) We could split off each subtest into its own server, but this test suite is already very slow. In future work we should fix this "for real" by making the API call cancelable. * Testing: assert against unexpected errors in `operator debug` tests. If we assert there are no unexpected error outputs, it's easier for the developer to debug when something is going wrong with the tests because the error output will be presented as a failing test, rather than just a failing exit code check. Or worse, no failing exit code check! This also forces us to be explicit about which tests will return 0 exit codes but still emit (presumably ignorable) error outputs. Additional minor bug fixes (mostly in tests) and test refactorings: * Fix text alignment on pprof Duration in `operator debug` output * Remove "done" channel from `operator debug` event stream test. The goroutine we're blocking for here already tells us it's done by sending a value, so block on that instead of an extraneous channel * Event stream test timer should start at current time, not zero * Remove noise from `operator debug` test log output. The `t.Logf` calls already are picked out from the rest of the test output by being prefixed with the filename. * Remove explicit pprof args so we use the defaults clamped from duration/interval
2022-04-07 19:00:07 +00:00
t.Logf("%s: test server started, waiting for leadership to establish\n", time.Since(start))
// Ensure leader is ready
testutil.WaitForLeader(t, srv.Agent.RPC)
Fix flaky `operator debug` test (#12501) We introduced a `pprof-interval` argument to `operator debug` in #11938, and unfortunately this has resulted in a lot of test flakes. The actual command in use is mostly fine (although I've fixed some quirks here), so what's really happened is that the change has revealed some existing issues in the tests. Summary of changes: * Make first pprof collection synchronous to preserve the existing behavior for the common case where the pprof interval matches the duration. * Clamp `operator debug` pprof timing to that of the command. The `pprof-duration` should be no more than `duration` and the `pprof-interval` should be no more than `pprof-duration`. Clamp the values rather than throwing errors, which could change the commands that existing users might already have in debugging scripts * Testing: remove test parallelism The `operator debug` tests that stand up servers can't be run in parallel, because we don't have a way of canceling the API calls for pprof. The agent will still be running the last pprof when we exit, and that breaks the next test that talks to that same agent. (Because you can only run one pprof at a time on any process!) We could split off each subtest into its own server, but this test suite is already very slow. In future work we should fix this "for real" by making the API call cancelable. * Testing: assert against unexpected errors in `operator debug` tests. If we assert there are no unexpected error outputs, it's easier for the developer to debug when something is going wrong with the tests because the error output will be presented as a failing test, rather than just a failing exit code check. Or worse, no failing exit code check! This also forces us to be explicit about which tests will return 0 exit codes but still emit (presumably ignorable) error outputs. Additional minor bug fixes (mostly in tests) and test refactorings: * Fix text alignment on pprof Duration in `operator debug` output * Remove "done" channel from `operator debug` event stream test. The goroutine we're blocking for here already tells us it's done by sending a value, so block on that instead of an extraneous channel * Event stream test timer should start at current time, not zero * Remove noise from `operator debug` test log output. The `t.Logf` calls already are picked out from the rest of the test output by being prefixed with the filename. * Remove explicit pprof args so we use the defaults clamped from duration/interval
2022-04-07 19:00:07 +00:00
t.Logf("%s: Leadership established\n", time.Since(start))
// Setup mock UI
ui := cli.NewMockUi()
cmd := &OperatorDebugCommand{Meta: Meta{Ui: ui}}
Fix flaky `operator debug` test (#12501) We introduced a `pprof-interval` argument to `operator debug` in #11938, and unfortunately this has resulted in a lot of test flakes. The actual command in use is mostly fine (although I've fixed some quirks here), so what's really happened is that the change has revealed some existing issues in the tests. Summary of changes: * Make first pprof collection synchronous to preserve the existing behavior for the common case where the pprof interval matches the duration. * Clamp `operator debug` pprof timing to that of the command. The `pprof-duration` should be no more than `duration` and the `pprof-interval` should be no more than `pprof-duration`. Clamp the values rather than throwing errors, which could change the commands that existing users might already have in debugging scripts * Testing: remove test parallelism The `operator debug` tests that stand up servers can't be run in parallel, because we don't have a way of canceling the API calls for pprof. The agent will still be running the last pprof when we exit, and that breaks the next test that talks to that same agent. (Because you can only run one pprof at a time on any process!) We could split off each subtest into its own server, but this test suite is already very slow. In future work we should fix this "for real" by making the API call cancelable. * Testing: assert against unexpected errors in `operator debug` tests. If we assert there are no unexpected error outputs, it's easier for the developer to debug when something is going wrong with the tests because the error output will be presented as a failing test, rather than just a failing exit code check. Or worse, no failing exit code check! This also forces us to be explicit about which tests will return 0 exit codes but still emit (presumably ignorable) error outputs. Additional minor bug fixes (mostly in tests) and test refactorings: * Fix text alignment on pprof Duration in `operator debug` output * Remove "done" channel from `operator debug` event stream test. The goroutine we're blocking for here already tells us it's done by sending a value, so block on that instead of an extraneous channel * Event stream test timer should start at current time, not zero * Remove noise from `operator debug` test log output. The `t.Logf` calls already are picked out from the rest of the test output by being prefixed with the filename. * Remove explicit pprof args so we use the defaults clamped from duration/interval
2022-04-07 19:00:07 +00:00
// Return command output back to the main test goroutine
chOutput := make(chan testOutput)
// Set duration for capture
duration := 5 * time.Second
// Fail with timeout if duration is exceeded by 5 seconds
timeout := duration + 5*time.Second
// Run debug in a goroutine so we can start the capture before we run the test job
Fix flaky `operator debug` test (#12501) We introduced a `pprof-interval` argument to `operator debug` in #11938, and unfortunately this has resulted in a lot of test flakes. The actual command in use is mostly fine (although I've fixed some quirks here), so what's really happened is that the change has revealed some existing issues in the tests. Summary of changes: * Make first pprof collection synchronous to preserve the existing behavior for the common case where the pprof interval matches the duration. * Clamp `operator debug` pprof timing to that of the command. The `pprof-duration` should be no more than `duration` and the `pprof-interval` should be no more than `pprof-duration`. Clamp the values rather than throwing errors, which could change the commands that existing users might already have in debugging scripts * Testing: remove test parallelism The `operator debug` tests that stand up servers can't be run in parallel, because we don't have a way of canceling the API calls for pprof. The agent will still be running the last pprof when we exit, and that breaks the next test that talks to that same agent. (Because you can only run one pprof at a time on any process!) We could split off each subtest into its own server, but this test suite is already very slow. In future work we should fix this "for real" by making the API call cancelable. * Testing: assert against unexpected errors in `operator debug` tests. If we assert there are no unexpected error outputs, it's easier for the developer to debug when something is going wrong with the tests because the error output will be presented as a failing test, rather than just a failing exit code check. Or worse, no failing exit code check! This also forces us to be explicit about which tests will return 0 exit codes but still emit (presumably ignorable) error outputs. Additional minor bug fixes (mostly in tests) and test refactorings: * Fix text alignment on pprof Duration in `operator debug` output * Remove "done" channel from `operator debug` event stream test. The goroutine we're blocking for here already tells us it's done by sending a value, so block on that instead of an extraneous channel * Event stream test timer should start at current time, not zero * Remove noise from `operator debug` test log output. The `t.Logf` calls already are picked out from the rest of the test output by being prefixed with the filename. * Remove explicit pprof args so we use the defaults clamped from duration/interval
2022-04-07 19:00:07 +00:00
t.Logf("%s: Starting nomad operator debug in goroutine\n", time.Since(start))
go func() {
code := cmd.Run([]string{"-address", url, "-duration", duration.String(), "-interval", "5s", "-event-topic", "Job:*"})
assert.Equal(t, 0, code)
chOutput <- testOutput{
name: "yo",
code: code,
output: ui.OutputWriter.String(),
error: ui.ErrorWriter.String(),
}
}()
// Start test job
Fix flaky `operator debug` test (#12501) We introduced a `pprof-interval` argument to `operator debug` in #11938, and unfortunately this has resulted in a lot of test flakes. The actual command in use is mostly fine (although I've fixed some quirks here), so what's really happened is that the change has revealed some existing issues in the tests. Summary of changes: * Make first pprof collection synchronous to preserve the existing behavior for the common case where the pprof interval matches the duration. * Clamp `operator debug` pprof timing to that of the command. The `pprof-duration` should be no more than `duration` and the `pprof-interval` should be no more than `pprof-duration`. Clamp the values rather than throwing errors, which could change the commands that existing users might already have in debugging scripts * Testing: remove test parallelism The `operator debug` tests that stand up servers can't be run in parallel, because we don't have a way of canceling the API calls for pprof. The agent will still be running the last pprof when we exit, and that breaks the next test that talks to that same agent. (Because you can only run one pprof at a time on any process!) We could split off each subtest into its own server, but this test suite is already very slow. In future work we should fix this "for real" by making the API call cancelable. * Testing: assert against unexpected errors in `operator debug` tests. If we assert there are no unexpected error outputs, it's easier for the developer to debug when something is going wrong with the tests because the error output will be presented as a failing test, rather than just a failing exit code check. Or worse, no failing exit code check! This also forces us to be explicit about which tests will return 0 exit codes but still emit (presumably ignorable) error outputs. Additional minor bug fixes (mostly in tests) and test refactorings: * Fix text alignment on pprof Duration in `operator debug` output * Remove "done" channel from `operator debug` event stream test. The goroutine we're blocking for here already tells us it's done by sending a value, so block on that instead of an extraneous channel * Event stream test timer should start at current time, not zero * Remove noise from `operator debug` test log output. The `t.Logf` calls already are picked out from the rest of the test output by being prefixed with the filename. * Remove explicit pprof args so we use the defaults clamped from duration/interval
2022-04-07 19:00:07 +00:00
t.Logf("%s: Running test job\n", time.Since(start))
job := testJob("event_stream_test")
resp, _, err := client.Jobs().Register(job, nil)
Fix flaky `operator debug` test (#12501) We introduced a `pprof-interval` argument to `operator debug` in #11938, and unfortunately this has resulted in a lot of test flakes. The actual command in use is mostly fine (although I've fixed some quirks here), so what's really happened is that the change has revealed some existing issues in the tests. Summary of changes: * Make first pprof collection synchronous to preserve the existing behavior for the common case where the pprof interval matches the duration. * Clamp `operator debug` pprof timing to that of the command. The `pprof-duration` should be no more than `duration` and the `pprof-interval` should be no more than `pprof-duration`. Clamp the values rather than throwing errors, which could change the commands that existing users might already have in debugging scripts * Testing: remove test parallelism The `operator debug` tests that stand up servers can't be run in parallel, because we don't have a way of canceling the API calls for pprof. The agent will still be running the last pprof when we exit, and that breaks the next test that talks to that same agent. (Because you can only run one pprof at a time on any process!) We could split off each subtest into its own server, but this test suite is already very slow. In future work we should fix this "for real" by making the API call cancelable. * Testing: assert against unexpected errors in `operator debug` tests. If we assert there are no unexpected error outputs, it's easier for the developer to debug when something is going wrong with the tests because the error output will be presented as a failing test, rather than just a failing exit code check. Or worse, no failing exit code check! This also forces us to be explicit about which tests will return 0 exit codes but still emit (presumably ignorable) error outputs. Additional minor bug fixes (mostly in tests) and test refactorings: * Fix text alignment on pprof Duration in `operator debug` output * Remove "done" channel from `operator debug` event stream test. The goroutine we're blocking for here already tells us it's done by sending a value, so block on that instead of an extraneous channel * Event stream test timer should start at current time, not zero * Remove noise from `operator debug` test log output. The `t.Logf` calls already are picked out from the rest of the test output by being prefixed with the filename. * Remove explicit pprof args so we use the defaults clamped from duration/interval
2022-04-07 19:00:07 +00:00
t.Logf("%s: Test job started\n", time.Since(start))
// Ensure job registered
require.NoError(t, err)
// Wait for the job to complete
if code := waitForSuccess(ui, client, fullId, t, resp.EvalID); code != 0 {
switch code {
case 1:
t.Fatalf("status code 1: All other failures (API connectivity, internal errors, etc)\n")
case 2:
t.Fatalf("status code 2: Problem scheduling job (impossible constraints, resources exhausted, etc)\n")
default:
t.Fatalf("status code non zero saw %d\n", code)
}
}
Fix flaky `operator debug` test (#12501) We introduced a `pprof-interval` argument to `operator debug` in #11938, and unfortunately this has resulted in a lot of test flakes. The actual command in use is mostly fine (although I've fixed some quirks here), so what's really happened is that the change has revealed some existing issues in the tests. Summary of changes: * Make first pprof collection synchronous to preserve the existing behavior for the common case where the pprof interval matches the duration. * Clamp `operator debug` pprof timing to that of the command. The `pprof-duration` should be no more than `duration` and the `pprof-interval` should be no more than `pprof-duration`. Clamp the values rather than throwing errors, which could change the commands that existing users might already have in debugging scripts * Testing: remove test parallelism The `operator debug` tests that stand up servers can't be run in parallel, because we don't have a way of canceling the API calls for pprof. The agent will still be running the last pprof when we exit, and that breaks the next test that talks to that same agent. (Because you can only run one pprof at a time on any process!) We could split off each subtest into its own server, but this test suite is already very slow. In future work we should fix this "for real" by making the API call cancelable. * Testing: assert against unexpected errors in `operator debug` tests. If we assert there are no unexpected error outputs, it's easier for the developer to debug when something is going wrong with the tests because the error output will be presented as a failing test, rather than just a failing exit code check. Or worse, no failing exit code check! This also forces us to be explicit about which tests will return 0 exit codes but still emit (presumably ignorable) error outputs. Additional minor bug fixes (mostly in tests) and test refactorings: * Fix text alignment on pprof Duration in `operator debug` output * Remove "done" channel from `operator debug` event stream test. The goroutine we're blocking for here already tells us it's done by sending a value, so block on that instead of an extraneous channel * Event stream test timer should start at current time, not zero * Remove noise from `operator debug` test log output. The `t.Logf` calls already are picked out from the rest of the test output by being prefixed with the filename. * Remove explicit pprof args so we use the defaults clamped from duration/interval
2022-04-07 19:00:07 +00:00
t.Logf("%s: test job is complete, eval id: %s\n", time.Since(start), resp.EvalID)
// Capture the output struct from nomad operator debug goroutine
var testOut testOutput
Fix flaky `operator debug` test (#12501) We introduced a `pprof-interval` argument to `operator debug` in #11938, and unfortunately this has resulted in a lot of test flakes. The actual command in use is mostly fine (although I've fixed some quirks here), so what's really happened is that the change has revealed some existing issues in the tests. Summary of changes: * Make first pprof collection synchronous to preserve the existing behavior for the common case where the pprof interval matches the duration. * Clamp `operator debug` pprof timing to that of the command. The `pprof-duration` should be no more than `duration` and the `pprof-interval` should be no more than `pprof-duration`. Clamp the values rather than throwing errors, which could change the commands that existing users might already have in debugging scripts * Testing: remove test parallelism The `operator debug` tests that stand up servers can't be run in parallel, because we don't have a way of canceling the API calls for pprof. The agent will still be running the last pprof when we exit, and that breaks the next test that talks to that same agent. (Because you can only run one pprof at a time on any process!) We could split off each subtest into its own server, but this test suite is already very slow. In future work we should fix this "for real" by making the API call cancelable. * Testing: assert against unexpected errors in `operator debug` tests. If we assert there are no unexpected error outputs, it's easier for the developer to debug when something is going wrong with the tests because the error output will be presented as a failing test, rather than just a failing exit code check. Or worse, no failing exit code check! This also forces us to be explicit about which tests will return 0 exit codes but still emit (presumably ignorable) error outputs. Additional minor bug fixes (mostly in tests) and test refactorings: * Fix text alignment on pprof Duration in `operator debug` output * Remove "done" channel from `operator debug` event stream test. The goroutine we're blocking for here already tells us it's done by sending a value, so block on that instead of an extraneous channel * Event stream test timer should start at current time, not zero * Remove noise from `operator debug` test log output. The `t.Logf` calls already are picked out from the rest of the test output by being prefixed with the filename. * Remove explicit pprof args so we use the defaults clamped from duration/interval
2022-04-07 19:00:07 +00:00
select {
case testOut = <-chOutput:
t.Logf("%s: goroutine is complete", time.Since(start))
case <-time.After(timeout):
t.Fatalf("timed out waiting for event stream event (duration: %s, timeout: %s", duration, timeout)
}
t.Logf("Values from struct -- code: %d, len(out): %d, len(outerr): %d\n", testOut.code, len(testOut.output), len(testOut.error))
require.Empty(t, testOut.error)
archive := extractArchiveName(testOut.output)
require.NotEmpty(t, archive)
fmt.Println(archive)
// TODO dmay: verify evenstream.json output file contains expected content
}
// extractArchiveName searches string s for the archive filename
func extractArchiveName(captureOutput string) string {
file := ""
r := regexp.MustCompile(`Created debug archive: (.+)?\n`)
res := r.FindStringSubmatch(captureOutput)
// If found, there will be 2 elements, where element [1] is the desired text from the submatch
if len(res) == 2 {
file = res[1]
}
return file
}