cli: rename paths in debug bundle for clarity (#11307)
* Rename folders to reflect purpose
* Improve captured files test coverage
* Rename CSI plugins output file
* Add changelog entry
* Fix test and make changelog message more explicit

Co-authored-by: Luiz Aoqui <luiz@hashicorp.com>
parent fa4df28fcd
commit c37a6ed583
@@ -0,0 +1,3 @@
+```release-note:breaking-change
+cli: Renamed folders in `nomad operator debug` bundle for clarity
+```

@@ -50,6 +50,10 @@ type OperatorDebugCommand struct {
 
 const (
 	userAgent = "nomad operator debug"
+	clusterDir  = "cluster"
+	clientDir   = "client"
+	serverDir   = "server"
+	intervalDir = "interval"
 )
 
 func (c *OperatorDebugCommand) Help() string {
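To make the rename concrete, here is a small standalone sketch (not part of the commit) showing how these constants compose into bundle paths; the old names in the comments are taken from the removed lines elsewhere in this diff:

```go
package main

import (
	"fmt"
	"path/filepath"
)

const (
	clusterDir  = "cluster"  // previously "version"
	clientDir   = "client"
	serverDir   = "server"
	intervalDir = "interval" // previously "nomad"
)

func main() {
	// Representative paths inside the debug bundle after the rename.
	fmt.Println(filepath.Join(clusterDir, "agent-self.json"))           // cluster/agent-self.json
	fmt.Println(filepath.Join(serverDir, "leader", "monitor.log"))      // server/leader/monitor.log
	fmt.Println(filepath.Join(intervalDir, "0000", "csi-plugins.json")) // interval/0000/csi-plugins.json
}
```
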
@@ -458,7 +462,7 @@ func (c *OperatorDebugCommand) Run(args []string) int {
 	}
 
 	// Write complete list of server members to file
-	c.writeJSON("version", "members.json", members, err)
+	c.writeJSON(clusterDir, "members.json", members, err)
 
 	// Filter for servers matching criteria
 	c.serverIDs, err = filterServerMembers(members, serverIDs, c.region)

@@ -538,18 +542,17 @@ func (c *OperatorDebugCommand) Run(args []string) int {
 
 // collect collects data from our endpoints and writes the archive bundle
 func (c *OperatorDebugCommand) collect(client *api.Client) error {
-	// Version contains cluster meta information
-	dir := "version"
+	// Collect cluster data
 
 	self, err := client.Agent().Self()
-	c.writeJSON(dir, "agent-self.json", self, err)
+	c.writeJSON(clusterDir, "agent-self.json", self, err)
 
 	var qo *api.QueryOptions
 	namespaces, _, err := client.Namespaces().List(qo)
-	c.writeJSON(dir, "namespaces.json", namespaces, err)
+	c.writeJSON(clusterDir, "namespaces.json", namespaces, err)
 
 	regions, err := client.Regions().List()
-	c.writeJSON(dir, "regions.json", regions, err)
+	c.writeJSON(clusterDir, "regions.json", regions, err)
 
 	// Fetch data directly from consul and vault. Ignore errors
 	var consul, vault string

@@ -582,8 +585,8 @@ func (c *OperatorDebugCommand) collect(client *api.Client) error {
 		}
 	}
 
-	c.collectConsul(dir, consul)
-	c.collectVault(dir, vault)
+	c.collectConsul(clusterDir, consul)
+	c.collectVault(clusterDir, vault)
 	c.collectAgentHosts(client)
 	c.collectPprofs(client)
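The `writeJSON` helper itself is not shown in this diff; judging from the call sites, it takes a directory, a file name, the API payload, and that call's error. A minimal sketch of the pattern, under the assumption (suggested by the removed test comment "Consul and Vault contain results or errors") that a failed call is recorded in the bundle instead of aborting collection; `writeJSONSketch` is hypothetical, not the command's real method:

```go
package main

import (
	"encoding/json"
	"errors"
	"os"
	"path/filepath"
)

// writeJSONSketch is a hypothetical stand-in for the command's writeJSON
// method: on success it writes the payload as JSON; on failure it writes
// the error text so the bundle still records what went wrong.
func writeJSONSketch(root, dir, file string, data interface{}, err error) error {
	target := filepath.Join(root, dir, file)
	if mkErr := os.MkdirAll(filepath.Dir(target), 0o755); mkErr != nil {
		return mkErr
	}
	if err != nil {
		return os.WriteFile(target, []byte(err.Error()), 0o644)
	}
	buf, mErr := json.MarshalIndent(data, "", "  ")
	if mErr != nil {
		return mErr
	}
	return os.WriteFile(target, buf, 0o644)
}

func main() {
	// Success and failure both leave a file behind in the bundle.
	_ = writeJSONSketch(os.TempDir(), "cluster", "regions.json", []string{"global"}, nil)
	_ = writeJSONSketch(os.TempDir(), "cluster", "vault-sys-health.json", nil, errors.New("connection refused"))
}
```
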
@@ -616,11 +619,11 @@ func (c *OperatorDebugCommand) mkdir(paths ...string) error {
 
 // startMonitors starts go routines for each node and client
 func (c *OperatorDebugCommand) startMonitors(client *api.Client) {
 	for _, id := range c.nodeIDs {
-		go c.startMonitor("client", "node_id", id, client)
+		go c.startMonitor(clientDir, "node_id", id, client)
 	}
 
 	for _, id := range c.serverIDs {
-		go c.startMonitor("server", "server_id", id, client)
+		go c.startMonitor(serverDir, "server_id", id, client)
 	}
 }

@@ -664,11 +667,11 @@ func (c *OperatorDebugCommand) startMonitor(path, idKey, nodeID string, client *api.Client) {
 // collectAgentHosts calls collectAgentHost for each selected node
 func (c *OperatorDebugCommand) collectAgentHosts(client *api.Client) {
 	for _, n := range c.nodeIDs {
-		c.collectAgentHost("client", n, client)
+		c.collectAgentHost(clientDir, n, client)
 	}
 
 	for _, n := range c.serverIDs {
-		c.collectAgentHost("server", n, client)
+		c.collectAgentHost(serverDir, n, client)
 	}
 }

@@ -676,7 +679,7 @@ func (c *OperatorDebugCommand) collectAgentHosts(client *api.Client) {
 func (c *OperatorDebugCommand) collectAgentHost(path, id string, client *api.Client) {
 	var host *api.HostDataResponse
 	var err error
-	if path == "server" {
+	if path == serverDir {
 		host, err = client.Agent().Host(id, "", nil)
 	} else {
 		host, err = client.Agent().Host("", id, nil)

@@ -699,11 +702,11 @@ func (c *OperatorDebugCommand) collectAgentHost(path, id string, client *api.Client) {
 // collectPprofs captures the /agent/pprof for each listed node
 func (c *OperatorDebugCommand) collectPprofs(client *api.Client) {
 	for _, n := range c.nodeIDs {
-		c.collectPprof("client", n, client)
+		c.collectPprof(clientDir, n, client)
 	}
 
 	for _, n := range c.serverIDs {
-		c.collectPprof("server", n, client)
+		c.collectPprof(serverDir, n, client)
 	}
 }

@@ -711,7 +714,7 @@ func (c *OperatorDebugCommand) collectPprofs(client *api.Client) {
 func (c *OperatorDebugCommand) collectPprof(path, id string, client *api.Client) {
 	pprofDurationSeconds := int(c.pprofDuration.Seconds())
 	opts := api.PprofOptions{Seconds: pprofDurationSeconds}
-	if path == "server" {
+	if path == serverDir {
 		opts.ServerID = id
 	} else {
 		opts.NodeID = id

@@ -810,7 +813,7 @@ func (c *OperatorDebugCommand) collectPeriodic(client *api.Client) {
 
 		case <-interval:
 			name = fmt.Sprintf("%04d", intervalCount)
-			dir = filepath.Join("nomad", name)
+			dir = filepath.Join(intervalDir, name)
 			c.Ui.Output(fmt.Sprintf("    Capture interval %s", name))
 			c.collectNomad(dir, client)
 			c.collectOperator(dir, client)
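For reference, a tiny sketch (not from the commit) of the snapshot naming this hunk changes: each capture tick formats the interval counter into a zero-padded directory under `interval/`, which is exactly what the updated test waits for below:

```go
package main

import (
	"fmt"
	"path/filepath"
)

func main() {
	const intervalDir = "interval" // constant introduced by this commit
	for intervalCount := 0; intervalCount < 2; intervalCount++ {
		name := fmt.Sprintf("%04d", intervalCount)
		fmt.Println(filepath.Join(intervalDir, name))
	}
	// Output:
	// interval/0000
	// interval/0001
}
```
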
@@ -859,7 +862,7 @@ func (c *OperatorDebugCommand) collectNomad(dir string, client *api.Client) error {
 
 	// CSI Plugins - /v1/plugins?type=csi
 	ps, _, err := client.CSIPlugins().List(qo)
-	c.writeJSON(dir, "plugins.json", ps, err)
+	c.writeJSON(dir, "csi-plugins.json", ps, err)
 
 	// CSI Plugin details - /v1/plugin/csi/:plugin_id
 	for _, p := range ps {

@@ -346,68 +346,126 @@ func TestDebug_Bad_CSIPlugin_Names(t *testing.T) {
 	var pluginFiles []string
 	for _, pluginName := range cases {
 		pluginFile := fmt.Sprintf("csi-plugin-id-%s.json", helper.CleanFilename(pluginName, "_"))
-		pluginFile = filepath.Join(path, "nomad", "0000", pluginFile)
+		pluginFile = filepath.Join(path, intervalDir, "0000", pluginFile)
 		pluginFiles = append(pluginFiles, pluginFile)
 	}
 
 	testutil.WaitForFiles(t, pluginFiles)
 }
 
+func buildPathSlice(path string, files []string) []string {
+	paths := []string{}
+	for _, file := range files {
+		paths = append(paths, filepath.Join(path, file))
+	}
+	return paths
+}
+
 func TestDebug_CapturedFiles(t *testing.T) {
-	srv, _, url := testServer(t, false, nil)
+	srv, _, url := testServer(t, true, nil)
 	testutil.WaitForLeader(t, srv.Agent.RPC)
 
+	serverNodeName := srv.Config.NodeName
+	region := srv.Config.Region
+	serverName := fmt.Sprintf("%s.%s", serverNodeName, region)
+	clientID := srv.Agent.Client().NodeID()
+
+	t.Logf("serverName: %s, clientID, %s", serverName, clientID)
+
+	// Setup file slices
+	clusterFiles := []string{
+		"agent-self.json",
+		"consul-agent-members.json",
+		"consul-agent-self.json",
+		"members.json",
+		"namespaces.json",
+		"regions.json",
+		"vault-sys-health.json",
+	}
+
+	pprofFiles := []string{
+		"allocs.prof",
+		"goroutine-debug1.txt",
+		"goroutine-debug2.txt",
+		"goroutine.prof",
+		"heap.prof",
+		"profile.prof",
+		"threadcreate.prof",
+		"trace.prof",
+	}
+
+	clientFiles := []string{
+		"agent-host.json",
+		"monitor.log",
+	}
+	clientFiles = append(clientFiles, pprofFiles...)
+
+	serverFiles := []string{
+		"agent-host.json",
+		"monitor.log",
+	}
+	serverFiles = append(serverFiles, pprofFiles...)
+
+	intervalFiles := []string{
+		"allocations.json",
+		"csi-plugins.json",
+		"csi-volumes.json",
+		"deployments.json",
+		"evaluations.json",
+		"jobs.json",
+		"license.json",
+		"metrics.json",
+		"nodes.json",
+		"operator-autopilot-health.json",
+		"operator-raft.json",
+		"operator-scheduler.json",
+	}
+
 	ui := cli.NewMockUi()
 	cmd := &OperatorDebugCommand{Meta: Meta{Ui: ui}}
 
 	code := cmd.Run([]string{
 		"-address", url,
 		"-output", os.TempDir(),
-		"-server-id", "leader",
+		"-server-id", serverName,
+		"-node-id", clientID,
 		"-duration", "1300ms",
 		"-interval", "600ms",
 	})
 
 	// Get capture directory
 	path := cmd.collectDir
 	defer os.Remove(path)
 
 	// There should be no errors
 	require.Empty(t, ui.ErrorWriter.String())
 	require.Equal(t, 0, code)
 	ui.ErrorWriter.Reset()
 
-	serverFiles := []string{
-		// Version is always captured
-		filepath.Join(path, "version", "agent-self.json"),
-
-		// Consul and Vault contain results or errors
-		filepath.Join(path, "version", "consul-agent-self.json"),
-		filepath.Join(path, "version", "vault-sys-health.json"),
-
-		// Monitor files are only created when selected
-		filepath.Join(path, "server", "leader", "monitor.log"),
-
-		// Pprof profiles
-		filepath.Join(path, "server", "leader", "profile.prof"),
-		filepath.Join(path, "server", "leader", "trace.prof"),
-		filepath.Join(path, "server", "leader", "goroutine.prof"),
-		filepath.Join(path, "server", "leader", "goroutine-debug1.txt"),
-		filepath.Join(path, "server", "leader", "goroutine-debug2.txt"),
-		filepath.Join(path, "server", "leader", "heap.prof"),
-		filepath.Join(path, "server", "leader", "allocs.prof"),
-		filepath.Join(path, "server", "leader", "threadcreate.prof"),
-
-		// Multiple snapshots are collected, 00 is always created
-		filepath.Join(path, "nomad", "0000", "jobs.json"),
-		filepath.Join(path, "nomad", "0000", "nodes.json"),
-		filepath.Join(path, "nomad", "0000", "metrics.json"),
-
-		// Multiple snapshots are collected, 01 requires two intervals
-		filepath.Join(path, "nomad", "0001", "jobs.json"),
-		filepath.Join(path, "nomad", "0001", "nodes.json"),
-		filepath.Join(path, "nomad", "0001", "metrics.json"),
-	}
-
-	testutil.WaitForFilesUntil(t, serverFiles, 2*time.Minute)
+	// Verify cluster files
+	clusterPaths := buildPathSlice(cmd.path(clusterDir), clusterFiles)
+	t.Logf("Waiting for cluster files in path: %s", clusterDir)
+	testutil.WaitForFilesUntil(t, clusterPaths, 2*time.Minute)
+
+	// Verify client files
+	clientPaths := buildPathSlice(cmd.path(clientDir, clientID), clientFiles)
+	t.Logf("Waiting for client files in path: %s", clientDir)
+	testutil.WaitForFilesUntil(t, clientPaths, 2*time.Minute)
+
+	// Verify server files
+	serverPaths := buildPathSlice(cmd.path(serverDir, serverName), serverFiles)
+	t.Logf("Waiting for server files in path: %s", serverDir)
+	testutil.WaitForFilesUntil(t, serverPaths, 2*time.Minute)
+
+	// Verify interval 0000 files
+	intervalPaths0 := buildPathSlice(cmd.path(intervalDir, "0000"), intervalFiles)
+	t.Logf("Waiting for interval 0000 files in path: %s", intervalDir)
+	testutil.WaitForFilesUntil(t, intervalPaths0, 2*time.Minute)
+
+	// Verify interval 0001 files
+	intervalPaths1 := buildPathSlice(cmd.path(intervalDir, "0001"), intervalFiles)
+	t.Logf("Waiting for interval 0001 files in path: %s", intervalDir)
+	testutil.WaitForFilesUntil(t, intervalPaths1, 2*time.Minute)
 }
 
 func TestDebug_ExistingOutput(t *testing.T) {

@@ -241,7 +241,7 @@ func WaitForRunning(t testing.TB, rpc rpcFn, job *structs.Job) []*structs.AllocListStub {
 // WaitForFiles blocks until all the files in the slice are present
 func WaitForFiles(t testing.TB, files []string) {
 	WaitForResult(func() (bool, error) {
-		return FilesExist(files), nil
+		return FilesExist(files)
 	}, func(err error) {
 		t.Fatalf("missing expected files: %v", err)
 	})

@@ -250,18 +250,18 @@ func WaitForFiles(t testing.TB, files []string) {
 // WaitForFilesUntil blocks until duration or all the files in the slice are present
 func WaitForFilesUntil(t testing.TB, files []string, until time.Duration) {
 	WaitForResultUntil(until, func() (bool, error) {
-		return FilesExist(files), nil
+		return FilesExist(files)
 	}, func(err error) {
 		t.Fatalf("missing expected files: %v", err)
 	})
 }
 
 // FilesExist verifies all files in the slice are present
-func FilesExist(files []string) bool {
+func FilesExist(files []string) (bool, error) {
 	for _, f := range files {
 		if _, err := os.Stat(f); os.IsNotExist(err) {
-			return false
+			return false, fmt.Errorf("expected file not found: %v", f)
 		}
 	}
-	return true
+	return true, nil
 }
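The point of changing `FilesExist` to return `(bool, error)` is that the retry harness's failure callback now receives the first missing path, so the test fails with a concrete filename instead of a bare timeout. A self-contained sketch of that flow; `waitForResult` here is a simplified stand-in for `testutil.WaitForResult`, not the real implementation:

```go
package main

import (
	"fmt"
	"os"
	"time"
)

// FilesExist, as changed by this commit.
func FilesExist(files []string) (bool, error) {
	for _, f := range files {
		if _, err := os.Stat(f); os.IsNotExist(err) {
			return false, fmt.Errorf("expected file not found: %v", f)
		}
	}
	return true, nil
}

// waitForResult is a simplified stand-in for testutil.WaitForResult:
// retry the check a few times, then pass the last error to the
// failure callback.
func waitForResult(test func() (bool, error), onFail func(error)) {
	var err error
	for i := 0; i < 10; i++ {
		var ok bool
		if ok, err = test(); ok {
			return
		}
		time.Sleep(10 * time.Millisecond)
	}
	onFail(err)
}

func main() {
	waitForResult(func() (bool, error) {
		return FilesExist([]string{"/no/such/debug/bundle.json"})
	}, func(err error) {
		// Now names the missing file, e.g.:
		// missing expected files: expected file not found: /no/such/debug/bundle.json
		fmt.Println("missing expected files:", err)
	})
}
```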