open-nomad/command/operator_debug_test.go

// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: MPL-2.0

package command

import (
	"fmt"
	"io"
	"net/http"
	"net/http/httptest"
	"os"
	"path/filepath"
	"regexp"
	"strings"
	"testing"
	"time"

	consulapi "github.com/hashicorp/consul/api"
	consultest "github.com/hashicorp/consul/sdk/testutil"
	"github.com/hashicorp/nomad/api"
	"github.com/hashicorp/nomad/ci"
	clienttest "github.com/hashicorp/nomad/client/testutil"
	"github.com/hashicorp/nomad/command/agent"
	"github.com/hashicorp/nomad/helper"
	"github.com/hashicorp/nomad/helper/pointer"
	"github.com/hashicorp/nomad/nomad/state"
	"github.com/hashicorp/nomad/testutil"
	"github.com/mitchellh/cli"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
)

// NOTE: most of these tests cannot be run in parallel

type testCase struct {
	name            string
	args            []string
	expectedCode    int
	expectedOutputs []string
	expectedError   string
}

type testCases []testCase

func runTestCases(t *testing.T, cases testCases) {
	t.Helper()
	for _, c := range cases {
		t.Run(c.name, func(t *testing.T) {
			ui := cli.NewMockUi()
			cmd := &OperatorDebugCommand{Meta: Meta{Ui: ui}}

			code := cmd.Run(c.args)
			out := ui.OutputWriter.String()
			outerr := ui.ErrorWriter.String()

			assert.Equalf(t, c.expectedCode, code, "did not get expected exit code")

			if len(c.expectedOutputs) > 0 {
				if assert.NotEmpty(t, out, "command output was empty") {
					for _, expectedOutput := range c.expectedOutputs {
						assert.Contains(t, out, expectedOutput, "did not get expected output")
					}
				}
			} else {
				assert.Empty(t, out, "command output should have been empty")
			}

			if c.expectedError == "" {
				assert.Empty(t, outerr, "got unexpected error")
			} else {
				assert.Containsf(t, outerr, c.expectedError, "did not get expected error")
			}
		})
	}
}
func newClientAgentConfigFunc(region string, nodeClass string, srvRPCAddr string) func(*agent.Config) {
	if region == "" {
		region = "global"
	}

	return func(c *agent.Config) {
		c.Region = region
		c.Client.NodeClass = nodeClass
		c.Client.Servers = []string{srvRPCAddr}
		c.Client.Enabled = true
		c.Server.Enabled = false
	}
}

func TestDebug_NodeClass(t *testing.T) {

	// Start test server and API client
	srv, _, url := testServer(t, false, nil)

	// Wait for leadership to establish
	testutil.WaitForLeader(t, srv.Agent.RPC)

	// Retrieve server RPC address to join clients
	srvRPCAddr := srv.GetConfig().AdvertiseAddrs.RPC
	t.Logf("Leader started, srv.GetConfig().AdvertiseAddrs.RPC: %s", srvRPCAddr)

	// Start test clients
	testClient(t, "client1", newClientAgentConfigFunc("global", "classA", srvRPCAddr))
	testClient(t, "client2", newClientAgentConfigFunc("global", "classB", srvRPCAddr))
	testClient(t, "client3", newClientAgentConfigFunc("global", "classA", srvRPCAddr))

	// Setup test cases
	cases := testCases{
		{
			name:         "address=api, node-class=classA, max-nodes=2",
			args:         []string{"-address", url, "-duration", "250ms", "-interval", "250ms", "-server-id", "all", "-node-id", "all", "-node-class", "classA", "-max-nodes", "2"},
			expectedCode: 0,
			expectedOutputs: []string{
				"Servers: (1/1)",
				"Clients: (2/3)",
				"Max node count reached (2)",
				"Node Class: classA",
				"Created debug archive",
			},
			expectedError: "",
		},
		{
			name:         "address=api, node-class=classB, max-nodes=2",
			args:         []string{"-address", url, "-duration", "250ms", "-interval", "250ms", "-server-id", "all", "-node-id", "all", "-node-class", "classB", "-max-nodes", "2"},
			expectedCode: 0,
			expectedOutputs: []string{
				"Servers: (1/1)",
				"Clients: (1/3)",
				"Node Class: classB",
				"Created debug archive",
			},
			expectedError: "",
		},
	}

	runTestCases(t, cases)
}

func TestDebug_ClientToServer(t *testing.T) {

	// Start test server and API client
	srv, _, url := testServer(t, false, nil)

	// Wait for leadership to establish
	testutil.WaitForLeader(t, srv.Agent.RPC)

	// Retrieve server RPC address to join client
	srvRPCAddr := srv.GetConfig().AdvertiseAddrs.RPC
	t.Logf("Leader started, srv.GetConfig().AdvertiseAddrs.RPC: %s", srvRPCAddr)

	// Start client
	agent1, _, _ := testClient(t, "client1", newClientAgentConfigFunc("", "", srvRPCAddr))

	// Get API addresses
	addrServer := srv.HTTPAddr()
	addrClient1 := agent1.HTTPAddr()

	t.Logf("testAgent api address: %s", url)
	t.Logf("Server    api address: %s", addrServer)
	t.Logf("Client1   api address: %s", addrClient1)

	// Setup test cases
	var cases = testCases{
		{
			name:            "testAgent api server",
			args:            []string{"-address", url, "-duration", "250ms", "-interval", "250ms", "-server-id", "all", "-node-id", "all"},
			expectedCode:    0,
			expectedOutputs: []string{"Created debug archive"},
		},
		{
			name:            "server address",
			args:            []string{"-address", addrServer, "-duration", "250ms", "-interval", "250ms", "-server-id", "all", "-node-id", "all"},
			expectedCode:    0,
			expectedOutputs: []string{"Created debug archive"},
		},
		{
			name:            "client1 address - verify no SIGSEGV panic",
			args:            []string{"-address", addrClient1, "-duration", "250ms", "-interval", "250ms", "-server-id", "all", "-node-id", "all"},
			expectedCode:    0,
			expectedOutputs: []string{"Created debug archive"},
		},
	}

	runTestCases(t, cases)
}

func TestDebug_MultiRegion(t *testing.T) {

	region1 := "region1"
	region2 := "region2"

	// Start region1 server
	server1, _, addrServer1 := testServer(t, false, func(c *agent.Config) { c.Region = region1 })
	testutil.WaitForLeader(t, server1.Agent.RPC)
	rpcAddrServer1 := server1.GetConfig().AdvertiseAddrs.RPC
	t.Logf("%s: Leader started, HTTPAddr: %s, RPC: %s", region1, addrServer1, rpcAddrServer1)

	// Start region1 client
	agent1, _, addrClient1 := testClient(t, "client1", newClientAgentConfigFunc(region1, "", rpcAddrServer1))
	nodeIdClient1 := agent1.Agent.Client().NodeID()
	t.Logf("%s: Client1 started, ID: %s, HTTPAddr: %s", region1, nodeIdClient1, addrClient1)

	// Start region2 server
	server2, _, addrServer2 := testServer(t, false, func(c *agent.Config) { c.Region = region2 })
	testutil.WaitForLeader(t, server2.Agent.RPC)
	rpcAddrServer2 := server2.GetConfig().AdvertiseAddrs.RPC
	t.Logf("%s: Leader started, HTTPAddr: %s, RPC: %s", region2, addrServer2, rpcAddrServer2)

	// Start client2
	agent2, _, addrClient2 := testClient(t, "client2", newClientAgentConfigFunc(region2, "", rpcAddrServer2))
	nodeIdClient2 := agent2.Agent.Client().NodeID()
	t.Logf("%s: Client1 started, ID: %s, HTTPAddr: %s", region2, nodeIdClient2, addrClient2)

	t.Logf("Region: %s, Server1   api address: %s", region1, addrServer1)
	t.Logf("Region: %s, Client1   api address: %s", region1, addrClient1)
	t.Logf("Region: %s, Server2   api address: %s", region2, addrServer2)
	t.Logf("Region: %s, Client2   api address: %s", region2, addrClient2)

	// Setup test cases
	var cases = testCases{
		// Good
		{
			name:            "no region - all servers, all clients",
			args:            []string{"-address", addrServer1, "-duration", "250ms", "-interval", "250ms", "-server-id", "all", "-node-id", "all"},
			expectedCode:    0,
			expectedOutputs: []string{"Starting debugger"},
		},
		{
			name:         "region1 - server1 address",
			args:         []string{"-address", addrServer1, "-region", region1, "-duration", "50ms", "-interval", "50ms", "-server-id", "all", "-node-id", "all"},
			expectedCode: 0,
			expectedOutputs: []string{
				"Region: " + region1 + "\n",
				"Servers: (1/1) [TestDebug_MultiRegion.region1]",
				"Clients: (1/1) [" + nodeIdClient1 + "]",
				"Created debug archive",
			},
		},
		{
			name:         "region1 - client1 address",
			args:         []string{"-address", addrClient1, "-region", region1, "-duration", "50ms", "-interval", "50ms", "-server-id", "all", "-node-id", "all"},
			expectedCode: 0,
			expectedOutputs: []string{
				"Region: " + region1 + "\n",
				"Servers: (1/1) [TestDebug_MultiRegion.region1]",
				"Clients: (1/1) [" + nodeIdClient1 + "]",
				"Created debug archive",
			},
		},
		{
			name:         "region2 - server2 address",
			args:         []string{"-address", addrServer2, "-region", region2, "-duration", "50ms", "-interval", "50ms", "-server-id", "all", "-node-id", "all"},
			expectedCode: 0,
			expectedOutputs: []string{
				"Region: " + region2 + "\n",
				"Servers: (1/1) [TestDebug_MultiRegion.region2]",
				"Clients: (1/1) [" + nodeIdClient2 + "]",
				"Created debug archive",
			},
		},
		{
			name:         "region2 - client2 address",
			args:         []string{"-address", addrClient2, "-region", region2, "-duration", "50ms", "-interval", "50ms", "-server-id", "all", "-node-id", "all"},
			expectedCode: 0,
			expectedOutputs: []string{
				"Region: " + region2 + "\n",
				"Servers: (1/1) [TestDebug_MultiRegion.region2]",
				"Clients: (1/1) [" + nodeIdClient2 + "]",
				"Created debug archive",
			},
		},

		// Bad
		{
			name:          "invalid region - all servers, all clients",
			args:          []string{"-address", addrServer1, "-region", "never", "-duration", "50ms", "-interval", "50ms", "-server-id", "all", "-node-id", "all"},
			expectedCode:  1,
			expectedError: "500 (No path to region)",
		},
	}

	runTestCases(t, cases)
}

func TestDebug_SingleServer(t *testing.T) {

	srv, _, url := testServer(t, false, nil)
	testutil.WaitForLeader(t, srv.Agent.RPC)

	var cases = testCases{
		{
			name:         "address=api, server-id=leader",
			args:         []string{"-address", url, "-duration", "250ms", "-interval", "250ms", "-server-id", "leader"},
			expectedCode: 0,
			expectedOutputs: []string{
				"Servers: (1/1)",
				"Clients: (0/0)",
				"Created debug archive",
			},
			expectedError: "No node(s) with prefix",
		},
		{
			name:         "address=api, server-id=all",
			args:         []string{"-address", url, "-duration", "250ms", "-interval", "250ms", "-server-id", "all"},
			expectedCode: 0,
			expectedOutputs: []string{
				"Servers: (1/1)",
				"Clients: (0/0)",
				"Created debug archive",
			},
			expectedError: "No node(s) with prefix",
		},
	}

	runTestCases(t, cases)
}

func TestDebug_Failures(t *testing.T) {

	srv, _, url := testServer(t, false, nil)
	testutil.WaitForLeader(t, srv.Agent.RPC)

	var cases = testCases{
		{
			name:          "fails incorrect args",
			args:          []string{"some", "bad", "args"},
			expectedCode:  1,
			expectedError: "This command takes no arguments",
		},
		{
			name:          "Fails illegal node ids",
			args:          []string{"-node-id", "foo:bar"},
			expectedCode:  1,
			expectedError: "Error querying node info",
		},
		{
			name:          "Fails missing node ids",
			args:          []string{"-node-id", "abc,def", "-duration", "250ms", "-interval", "250ms"},
			expectedCode:  1,
			expectedError: "Error querying node info",
		},
		{
			name:          "Fails bad durations",
			args:          []string{"-duration", "foo"},
			expectedCode:  1,
			expectedError: "Error parsing duration: foo: time: invalid duration \"foo\""},
		{
			name:          "Fails bad intervals",
			args:          []string{"-interval", "bar"},
			expectedCode:  1,
			expectedError: "Error parsing interval: bar: time: invalid duration \"bar\"",
		},
		{
			name:          "Fails intervals greater than duration",
			args:          []string{"-duration", "5m", "-interval", "10m"},
			expectedCode:  1,
			expectedError: "Error parsing interval: 10m is greater than duration 5m",
		},
		{
			name:          "Fails bad pprof duration",
			args:          []string{"-pprof-duration", "baz"},
			expectedCode:  1,
			expectedError: "Error parsing pprof duration: baz: time: invalid duration \"baz\"",
		},
		{
			name:          "Fails bad pprof interval",
			args:          []string{"-pprof-interval", "bar"},
			expectedCode:  1,
			expectedError: "Error parsing pprof-interval: bar: time: invalid duration \"bar\"",
		},
		{
			name:          "Fails bad address",
			args:          []string{"-address", url + "bogus"},
			expectedCode:  1,
			expectedError: "invalid address",
		},
	}

	runTestCases(t, cases)
}

func TestDebug_Bad_CSIPlugin_Names(t *testing.T) {

	// Start test server and API client
	srv, _, url := testServer(t, false, nil)

	// Wait for leadership to establish
	testutil.WaitForLeader(t, srv.Agent.RPC)

	cases := []string{
		"aws/ebs",
		"gcp-*-1",
	}
	for _, pluginName := range cases {
		cleanup := state.CreateTestCSIPlugin(srv.Agent.Server().State(), pluginName)
		defer cleanup()
	}

	// Setup mock UI
	ui := cli.NewMockUi()
	cmd := &OperatorDebugCommand{Meta: Meta{Ui: ui}}
	testDir := t.TempDir()
	defer os.Remove(testDir)

	// Debug on the leader and all client nodes
	code := cmd.Run([]string{"-address", url, "-duration", "250ms", "-interval", "250ms", "-server-id", "leader", "-node-id", "all", "-output", testDir})
	assert.Equal(t, 0, code)

	// Bad plugin name should be escaped before it reaches the sandbox test
	require.NotContains(t, ui.ErrorWriter.String(), "file path escapes capture directory")
	require.Contains(t, ui.OutputWriter.String(), "Starting debugger")

	path := cmd.collectDir

	var pluginFiles []string
	for _, pluginName := range cases {
		pluginFile := fmt.Sprintf("csi-plugin-id-%s.json", helper.CleanFilename(pluginName, "_"))
		pluginFile = filepath.Join(path, intervalDir, "0000", pluginFile)
		pluginFiles = append(pluginFiles, pluginFile)
	}

	testutil.WaitForFiles(t, pluginFiles)
}

func buildPathSlice(path string, files []string) []string {
	paths := []string{}
	for _, file := range files {
		paths = append(paths, filepath.Join(path, file))
	}
	return paths
}

func TestDebug_CapturedFiles(t *testing.T) {
	srv, _, url := testServer(t, true, nil)
	testutil.WaitForLeader(t, srv.Agent.RPC)

	serverNodeName := srv.Config.NodeName
	region := srv.Config.Region
	serverName := fmt.Sprintf("%s.%s", serverNodeName, region)
	clientID := srv.Agent.Client().NodeID()
	testutil.WaitForClient(t, srv.Agent.Client().RPC, clientID, srv.Agent.Client().Region())

	t.Logf("serverName: %s, clientID, %s", serverName, clientID)

	// Setup file slices
	clusterFiles := []string{
		"agent-self.json",
		"members.json",
		"namespaces.json",
		"regions.json",
	}

	pprofFiles := []string{
		"allocs.prof",
		"goroutine-debug1.txt",
		"goroutine-debug2.txt",
		"goroutine.prof",
		"heap.prof",
		"profile_0000.prof",
		"threadcreate.prof",
		"trace.prof",
	}

	clientFiles := []string{
		"agent-host.json",
		"monitor.log",
	}
	clientFiles = append(clientFiles, pprofFiles...)

	serverFiles := []string{
		"agent-host.json",
		"monitor.log",
	}
	serverFiles = append(serverFiles, pprofFiles...)

	intervalFiles := []string{
		"allocations.json",
		"csi-plugins.json",
		"csi-volumes.json",
		"deployments.json",
		"evaluations.json",
		"jobs.json",
		"license.json",
		"metrics.json",
		"nodes.json",
		"operator-autopilot-health.json",
		"operator-raft.json",
		"operator-scheduler.json",
	}

	ui := cli.NewMockUi()
	cmd := &OperatorDebugCommand{Meta: Meta{Ui: ui}}
	testDir := t.TempDir()
	defer os.Remove(testDir)

	duration := 2 * time.Second
	interval := 750 * time.Millisecond
	waitTime := 2 * duration

	code := cmd.Run([]string{
		"-address", url,
		"-output", testDir,
		"-server-id", serverName,
		"-node-id", clientID,
		"-duration", duration.String(),
		"-interval", interval.String(),
	})

	// There should be no errors
	require.Empty(t, ui.ErrorWriter.String())
	require.Equal(t, 0, code)
	ui.ErrorWriter.Reset()

	// Verify cluster files
	clusterPaths := buildPathSlice(cmd.path(clusterDir), clusterFiles)
	t.Logf("Waiting for cluster files in path: %s", clusterDir)
	testutil.WaitForFilesUntil(t, clusterPaths, waitTime)

	// Verify client files
	clientPaths := buildPathSlice(cmd.path(clientDir, clientID), clientFiles)
	t.Logf("Waiting for client files in path: %s", clientDir)
	testutil.WaitForFilesUntil(t, clientPaths, waitTime)

	// Verify server files
	serverPaths := buildPathSlice(cmd.path(serverDir, serverName), serverFiles)
	t.Logf("Waiting for server files in path: %s", serverDir)
	testutil.WaitForFilesUntil(t, serverPaths, waitTime)

	// Verify interval 0000 files
	intervalPaths0 := buildPathSlice(cmd.path(intervalDir, "0000"), intervalFiles)
	t.Logf("Waiting for interval 0000 files in path: %s", intervalDir)
	testutil.WaitForFilesUntil(t, intervalPaths0, waitTime)

	// Verify interval 0001 files
	intervalPaths1 := buildPathSlice(cmd.path(intervalDir, "0001"), intervalFiles)
	t.Logf("Waiting for interval 0001 files in path: %s", intervalDir)
	testutil.WaitForFilesUntil(t, intervalPaths1, waitTime)
}

func TestDebug_ExistingOutput(t *testing.T) {
	ci.Parallel(t)

	ui := cli.NewMockUi()
	cmd := &OperatorDebugCommand{Meta: Meta{Ui: ui}}

	// Fails existing output
	format := "2006-01-02-150405Z"
	stamped := "nomad-debug-" + time.Now().UTC().Format(format)
	tempDir := t.TempDir()
	path := filepath.Join(tempDir, stamped)
	os.MkdirAll(path, 0755)
	defer os.Remove(tempDir)

	code := cmd.Run([]string{"-output", tempDir, "-duration", "50ms", "-interval", "50ms"})
	require.Equal(t, 2, code)
}

func TestDebug_Fail_Pprof(t *testing.T) {

	// Setup agent config with debug endpoints disabled
	agentConfFunc := func(c *agent.Config) {
		c.EnableDebug = false
	}

	// Start test server and API client
	srv, _, url := testServer(t, false, agentConfFunc)

	// Wait for leadership to establish
	testutil.WaitForLeader(t, srv.Agent.RPC)

	// Setup mock UI
	ui := cli.NewMockUi()
	cmd := &OperatorDebugCommand{Meta: Meta{Ui: ui}}

	// Debug on server with endpoints disabled
	code := cmd.Run([]string{"-address", url, "-duration", "250ms", "-interval", "250ms", "-server-id", "all"})

	assert.Equal(t, 0, code) // Pprof failure isn't fatal
	require.Contains(t, ui.OutputWriter.String(), "Starting debugger")
	require.Contains(t, ui.ErrorWriter.String(), "Failed to retrieve pprof") // Should report pprof failure
	require.Contains(t, ui.ErrorWriter.String(), "Permission denied")        // Specifically permission denied
	require.Contains(t, ui.OutputWriter.String(), "Created debug archive")   // Archive should be generated anyway
}

// TestDebug_PprofVersionCheck asserts that only versions < 0.12.0 are
// filtered by the version constraint.
func TestDebug_PprofVersionCheck(t *testing.T) {
	cases := []struct {
		version string
		errMsg  string
	}{
		{"0.8.7", ""},
		{"0.11.1", "unsupported version=0.11.1 matches version filter >= 0.11.0, <= 0.11.2"},
		{"0.11.2", "unsupported version=0.11.2 matches version filter >= 0.11.0, <= 0.11.2"},
		{"0.11.2+ent", "unsupported version=0.11.2+ent matches version filter >= 0.11.0, <= 0.11.2"},
		{"0.11.3", ""},
		{"0.11.3+ent", ""},
		{"0.12.0", ""},
		{"1.3.0", ""},
		{"foo.bar", "error: Malformed version: foo.bar"},
	}

	for _, tc := range cases {
		t.Run(tc.version, func(t *testing.T) {
			err := checkVersion(tc.version, minimumVersionPprofConstraint)
			if tc.errMsg == "" {
				require.NoError(t, err, "expected no error from %s", tc.version)
			} else {
				require.EqualError(t, err, tc.errMsg)
			}
		})
	}
}

func TestDebug_StringToSlice(t *testing.T) {
	ci.Parallel(t)

	cases := []struct {
		input    string
		expected []string
	}{
		{input: ",,", expected: []string(nil)},
		{input: "", expected: []string(nil)},
		{input: "foo, bar", expected: []string{"foo", "bar"}},
		{input: "  foo, bar ", expected: []string{"foo", "bar"}},
		{input: "foo,,bar", expected: []string{"foo", "bar"}},
	}
	for _, tc := range cases {
		out := stringToSlice(tc.input)
		require.Equal(t, tc.expected, out)
	}
}

func TestDebug_External(t *testing.T) {
	ci.Parallel(t)

	// address calculation honors CONSUL_HTTP_SSL
	// ssl: true - Correct alignment
	e := &external{addrVal: "https://127.0.0.1:8500", ssl: true}
	addr := e.addr("foo")
	require.Equal(t, "https://127.0.0.1:8500", addr)

	// ssl: true - protocol incorrect
	// NOTE: Address with protocol now overrides ssl flag
	e = &external{addrVal: "http://127.0.0.1:8500", ssl: true}
	addr = e.addr("foo")
	require.Equal(t, "http://127.0.0.1:8500", addr)

	// ssl: true - protocol missing
	e = &external{addrVal: "127.0.0.1:8500", ssl: true}
	addr = e.addr("foo")
	require.Equal(t, "https://127.0.0.1:8500", addr)

	// ssl: false - correct alignment
	e = &external{addrVal: "http://127.0.0.1:8500", ssl: false}
	addr = e.addr("foo")
	require.Equal(t, "http://127.0.0.1:8500", addr)

	// ssl: false - protocol incorrect
	// NOTE: Address with protocol now overrides ssl flag
	e = &external{addrVal: "https://127.0.0.1:8500", ssl: false}
	addr = e.addr("foo")
	require.Equal(t, "https://127.0.0.1:8500", addr)

	// ssl: false - protocol missing
	e = &external{addrVal: "127.0.0.1:8500", ssl: false}
	addr = e.addr("foo")
	require.Equal(t, "http://127.0.0.1:8500", addr)

	// Address through proxy might not have a port
	e = &external{addrVal: "https://127.0.0.1", ssl: true}
	addr = e.addr("foo")
	require.Equal(t, "https://127.0.0.1", addr)
}

func TestDebug_WriteBytes_Nil(t *testing.T) {
	ci.Parallel(t)

	var testDir, testFile, testPath string
	var testBytes []byte

	// Setup mock UI
	ui := cli.NewMockUi()
	cmd := &OperatorDebugCommand{Meta: Meta{Ui: ui}}

	testDir = t.TempDir()
	defer os.Remove(testDir)
	cmd.collectDir = testDir

	testFile = "test_nil.json"
	testPath = filepath.Join(testDir, testFile)

	// Write nil file at top level of collect directory
	err := cmd.writeBytes("", testFile, testBytes)
	require.NoError(t, err)
	require.FileExists(t, testPath)
}

func TestDebug_WriteBytes_PathEscapesSandbox(t *testing.T) {
	ci.Parallel(t)

	var testDir, testFile string
	var testBytes []byte

	testDir = t.TempDir()
	defer os.Remove(testDir)

	testFile = "testing.json"
	testPath := filepath.Join(testDir, testFile)
	defer os.Remove(testPath)

	// Setup mock UI
	ui := cli.NewMockUi()
	cmd := &OperatorDebugCommand{Meta: Meta{Ui: ui}}

	// Empty collectDir will always appear to be escaped
	cmd.collectDir = ""
	err := cmd.writeBytes(testDir, testFile, testBytes)
	require.Error(t, err)
}

func TestDebug_CollectConsul(t *testing.T) {
	ci.Parallel(t)
	if testing.Short() {
		t.Skip("-short set; skipping")
	}

	// Skip test if Consul binary cannot be found
	clienttest.RequireConsul(t)

	// Create an embedded Consul server
	testconsul, err := consultest.NewTestServerConfigT(t, func(c *consultest.TestServerConfig) {
		c.Peering = nil // fix for older versions of Consul (<1.13.0) that don't support peering
		// If -v wasn't specified squelch consul logging
		if !testing.Verbose() {
			c.Stdout = io.Discard
			c.Stderr = io.Discard
		}
	})
	require.NoError(t, err)
	if err != nil {
		t.Fatalf("error starting test consul server: %v", err)
	}
	defer testconsul.Stop()

	consulConfig := consulapi.DefaultConfig()
	consulConfig.Address = testconsul.HTTPAddr

	// Setup mock UI
	ui := cli.NewMockUi()
	c := &OperatorDebugCommand{Meta: Meta{Ui: ui}}

	// Setup Consul *external
	ce := &external{}
	ce.setAddr(consulConfig.Address)
	if ce.ssl {
		ce.tls = &api.TLSConfig{}
	}

	// Set global client
	c.consul = ce

	// Setup capture directory
	testDir := t.TempDir()
	defer os.Remove(testDir)
	c.collectDir = testDir

	// Collect data from Consul into folder "test"
	c.collectConsul("test")

	require.Empty(t, ui.ErrorWriter.String())
	require.FileExists(t, filepath.Join(testDir, "test", "consul-agent-host.json"))
	require.FileExists(t, filepath.Join(testDir, "test", "consul-agent-members.json"))
	require.FileExists(t, filepath.Join(testDir, "test", "consul-agent-metrics.json"))
	require.FileExists(t, filepath.Join(testDir, "test", "consul-leader.json"))
}

func TestDebug_CollectVault(t *testing.T) {
	ci.Parallel(t)
	if testing.Short() {
		t.Skip("-short set; skipping")
	}

	// Skip test if Consul binary cannot be found
	clienttest.RequireVault(t)

	// Create a Vault server
	v := testutil.NewTestVault(t)
	defer v.Stop()

	// Setup mock UI
	ui := cli.NewMockUi()
	c := &OperatorDebugCommand{Meta: Meta{Ui: ui}}

	// Setup Vault *external
	ve := &external{}
	ve.tokenVal = v.RootToken
	ve.setAddr(v.HTTPAddr)
	if ve.ssl {
		ve.tls = &api.TLSConfig{}
	}

	// Set global client
	c.vault = ve

	// Set capture directory
	testDir := t.TempDir()
	defer os.Remove(testDir)
	c.collectDir = testDir

	// Collect data from Vault
	err := c.collectVault("test", "")

	require.NoError(t, err)
	require.Empty(t, ui.ErrorWriter.String())

	require.FileExists(t, filepath.Join(testDir, "test", "vault-sys-health.json"))
}

// TestDebug_RedirectError asserts that redirect errors are detected so they
// can be translated into more understandable output.
func TestDebug_RedirectError(t *testing.T) {
	ci.Parallel(t)

	// Create a test server that always returns the error many versions of
	// Nomad return instead of a 404 for unknown paths.
	// 1st request redirects to /ui/
	// 2nd request returns UI's HTML
	ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		if strings.HasSuffix(r.URL.String(), "/ui/") {
			fmt.Fprintln(w, `<html>Fake UI HTML</html>`)
			return
		}

		w.Header().Set("Location", "/ui/")
		w.WriteHeader(http.StatusTemporaryRedirect)
		fmt.Fprintln(w, `<a href="/ui/">Temporary Redirect</a>.`)
	}))
	defer ts.Close()

	config := api.DefaultConfig()
	config.Address = ts.URL
	client, err := api.NewClient(config)
	require.NoError(t, err)

	resp, err := client.Agent().Host("abc", "", nil)
	assert.Nil(t, resp)
	assert.True(t, isRedirectError(err), err.Error())
}

// TestDebug_StaleLeadership verifies that APIs that are required to
// complete a debug run have their query options configured with the
// -stale flag
func TestDebug_StaleLeadership(t *testing.T) {

	srv, _, url := testServerWithoutLeader(t, false, nil)
	addrServer := srv.HTTPAddr()

	t.Logf("testAgent api address: %s", url)
	t.Logf("Server    api address: %s", addrServer)

	var cases = testCases{
		{
			name: "no leader without stale flag",
			args: []string{"-address", addrServer,
				"-duration", "250ms", "-interval", "250ms",
				"-server-id", "all", "-node-id", "all"},
			expectedCode:  1,
			expectedError: "No cluster leader",
		},
		{
			name: "no leader with stale flag",
			args: []string{
				"-address", addrServer,
				"-duration", "250ms", "-interval", "250ms",
				"-server-id", "all", "-node-id", "all",
				"-stale"},
			expectedCode:    0,
			expectedOutputs: []string{"Created debug archive"},
			expectedError:   "No node(s) with prefix", // still exits 0
		},
	}

	runTestCases(t, cases)
}

func testServerWithoutLeader(t *testing.T, runClient bool, cb func(*agent.Config)) (*agent.TestAgent, *api.Client, string) {
	// Make a new test server
	a := agent.NewTestAgent(t, t.Name(), func(config *agent.Config) {
		config.Client.Enabled = runClient
		config.Server.Enabled = true
		config.Server.NumSchedulers = pointer.Of(0)
		config.Server.BootstrapExpect = 3

		if cb != nil {
			cb(config)
		}
	})
	t.Cleanup(func() { a.Shutdown() })

	c := a.Client()
	return a, c, a.HTTPAddr()
}

// testOutput is used to receive test output from a channel
type testOutput struct {
	name   string
	code   int
	output string
	error  string
}

func TestDebug_EventStream_TopicsFromString(t *testing.T) {
	ci.Parallel(t)

	cases := []struct {
		name      string
		topicList string
		want      map[api.Topic][]string
	}{
		{
			name:      "topics = all",
			topicList: "all",
			want:      allTopics(),
		},
		{
			name:      "topics = none",
			topicList: "none",
			want:      nil,
		},
		{
			name:      "two topics",
			topicList: "Deployment,Job",
			want: map[api.Topic][]string{
				"Deployment": {"*"},
				"Job":        {"*"},
			},
		},
		{
			name:      "multiple topics and filters (using api const)",
			topicList: "Evaluation:example,Job:*,Node:*",
			want: map[api.Topic][]string{
				api.TopicEvaluation: {"example"},
				api.TopicJob:        {"*"},
				api.TopicNode:       {"*"},
			},
		},
		{
			name:      "capitalize topics",
			topicList: "evaluation:example,job:*,node:*",
			want: map[api.Topic][]string{
				api.TopicEvaluation: {"example"},
				api.TopicJob:        {"*"},
				api.TopicNode:       {"*"},
			},
		},
		{
			name:      "all topics for filterKey",
			topicList: "*:example",
			want: map[api.Topic][]string{
				"*": {"example"},
			},
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got, err := topicsFromString(tc.topicList)
			require.NoError(t, err)
			require.Equal(t, tc.want, got)
		})
	}
}

func TestDebug_EventStream(t *testing.T) {
	ci.Parallel(t)

	// TODO dmay: specify output directory to allow inspection of eventstream.json
	// TODO dmay: require specific events in the eventstream.json file(s)
	// TODO dmay: scenario where no events are expected, verify "No events captured"
	// TODO dmay: verify event topic filtering only includes expected events

	start := time.Now()

	// Start test server
	srv, client, url := testServer(t, true, nil)
	t.Logf("%s: test server started, waiting for leadership to establish\n", time.Since(start))

	// Ensure leader is ready
	testutil.WaitForLeader(t, srv.Agent.RPC)
	t.Logf("%s: Leadership established\n", time.Since(start))

	// Setup mock UI
	ui := cli.NewMockUi()
	cmd := &OperatorDebugCommand{Meta: Meta{Ui: ui}}

	// Return command output back to the main test goroutine
	chOutput := make(chan testOutput)

	// Set duration for capture
	duration := 5 * time.Second
	// Fail with timeout if duration is exceeded by 5 seconds
	timeout := duration + 5*time.Second

	// Run debug in a goroutine so we can start the capture before we run the test job
	t.Logf("%s: Starting nomad operator debug in goroutine\n", time.Since(start))
	go func() {
		code := cmd.Run([]string{"-address", url, "-duration", duration.String(), "-interval", "5s", "-event-topic", "Job:*"})
		assert.Equal(t, 0, code)

		chOutput <- testOutput{
			name:   "yo",
			code:   code,
			output: ui.OutputWriter.String(),
			error:  ui.ErrorWriter.String(),
		}
	}()

	// Start test job
	t.Logf("%s: Running test job\n", time.Since(start))
	job := testJob("event_stream_test")
	resp, _, err := client.Jobs().Register(job, nil)
	t.Logf("%s: Test job started\n", time.Since(start))

	// Ensure job registered
	require.NoError(t, err)

	// Wait for the job to complete
	if code := waitForSuccess(ui, client, fullId, t, resp.EvalID); code != 0 {
		switch code {
		case 1:
			t.Fatalf("status code 1: All other failures (API connectivity, internal errors, etc)\n")
		case 2:
			t.Fatalf("status code 2: Problem scheduling job (impossible constraints, resources exhausted, etc)\n")
		default:
			t.Fatalf("status code non zero saw %d\n", code)
		}
	}
	t.Logf("%s: test job is complete, eval id: %s\n", time.Since(start), resp.EvalID)

	// Capture the output struct from nomad operator debug goroutine
	var testOut testOutput
	select {
	case testOut = <-chOutput:
		t.Logf("%s: goroutine is complete", time.Since(start))
	case <-time.After(timeout):
		t.Fatalf("timed out waiting for event stream event (duration: %s, timeout: %s", duration, timeout)
	}

	t.Logf("Values from struct -- code: %d, len(out): %d, len(outerr): %d\n", testOut.code, len(testOut.output), len(testOut.error))

	require.Empty(t, testOut.error)

	archive := extractArchiveName(testOut.output)
	require.NotEmpty(t, archive)
	fmt.Println(archive)

	// TODO dmay: verify evenstream.json output file contains expected content
}

// extractArchiveName searches string s for the archive filename
func extractArchiveName(captureOutput string) string {
	file := ""

	r := regexp.MustCompile(`Created debug archive: (.+)?\n`)
	res := r.FindStringSubmatch(captureOutput)
	// If found, there will be 2 elements, where element [1] is the desired text from the submatch
	if len(res) == 2 {
		file = res[1]
	}

	return file
}