From e7e8ce212eecac6b5e43ba24524ff08ee9e68659 Mon Sep 17 00:00:00 2001 From: Danish Prakash Date: Tue, 5 Apr 2022 00:54:12 +0530 Subject: [PATCH] command/operator_debug: add pprof interval (#11938) --- .changelog/11938.txt | 3 + command/operator_debug.go | 66 ++++++++++++++++--- command/operator_debug_test.go | 54 ++++++++------- .../content/docs/commands/operator/debug.mdx | 6 +- 4 files changed, 96 insertions(+), 33 deletions(-) create mode 100644 .changelog/11938.txt diff --git a/.changelog/11938.txt b/.changelog/11938.txt new file mode 100644 index 000000000..fd86718bc --- /dev/null +++ b/.changelog/11938.txt @@ -0,0 +1,3 @@ +```release-note:improvement +cli: added `-pprof-interval` to `nomad operator debug` command +``` diff --git a/command/operator_debug.go b/command/operator_debug.go index 446ddc03e..79da977c7 100644 --- a/command/operator_debug.go +++ b/command/operator_debug.go @@ -38,6 +38,7 @@ type OperatorDebugCommand struct { collectDir string duration time.Duration interval time.Duration + pprofInterval time.Duration pprofDuration time.Duration logLevel string maxNodes int @@ -184,6 +185,10 @@ Debug Options: -pprof-duration= Duration for pprof collection. Defaults to 1s. + -pprof-interval= + The interval between pprof collections. Set interval equal to + duration to capture a single snapshot. Defaults to 250ms. + -server-id=, Comma separated list of Nomad server names to monitor for logs, API outputs, and pprof profiles. Accepts server names, "leader", or "all". @@ -334,7 +339,7 @@ func (c *OperatorDebugCommand) Run(args []string) int { flags := c.Meta.FlagSet(c.Name(), FlagSetClient) flags.Usage = func() { c.Ui.Output(c.Help()) } - var duration, interval, output, pprofDuration, eventTopic string + var duration, interval, pprofInterval, output, pprofDuration, eventTopic string var eventIndex int64 var nodeIDs, serverIDs string var allowStale bool @@ -351,6 +356,7 @@ func (c *OperatorDebugCommand) Run(args []string) int { flags.BoolVar(&allowStale, "stale", false, "") flags.StringVar(&output, "output", "", "") flags.StringVar(&pprofDuration, "pprof-duration", "1s", "") + flags.StringVar(&pprofInterval, "pprof-interval", "250ms", "") flags.BoolVar(&c.verbose, "verbose", false, "") c.consul = &external{tls: &api.TLSConfig{}} @@ -400,6 +406,14 @@ func (c *OperatorDebugCommand) Run(args []string) int { return 1 } + // Parse the pprof capture interval + pi, err := time.ParseDuration(pprofInterval) + if err != nil { + c.Ui.Error(fmt.Sprintf("Error parsing pprof-interval: %s: %s", pprofInterval, err.Error())) + return 1 + } + c.pprofInterval = pi + // Parse the pprof capture duration pd, err := time.ParseDuration(pprofDuration) if err != nil { @@ -408,6 +422,12 @@ func (c *OperatorDebugCommand) Run(args []string) int { } c.pprofDuration = pd + // Validate pprof interval + if pi.Seconds() > pd.Seconds() { + c.Ui.Error(fmt.Sprintf("pprof-interval %s must be less than pprof-duration %s", pprofInterval, pprofDuration)) + return 1 + } + // Parse event stream topic filter t, err := topicsFromString(eventTopic) if err != nil { @@ -595,6 +615,7 @@ func (c *OperatorDebugCommand) Run(args []string) int { } c.Ui.Output(fmt.Sprintf(" Interval: %s", interval)) c.Ui.Output(fmt.Sprintf(" Duration: %s", duration)) + c.Ui.Output(fmt.Sprintf(" pprof Interval: %s", pprofInterval)) if c.pprofDuration.Seconds() != 1 { c.Ui.Output(fmt.Sprintf(" pprof Duration: %s", c.pprofDuration)) } @@ -663,7 +684,7 @@ func (c *OperatorDebugCommand) collect(client *api.Client) error { c.collectVault(clusterDir, vaultAddr) c.collectAgentHosts(client) - c.collectPprofs(client) + go c.collectPeriodicPprofs(client) c.collectPeriodic(client) @@ -876,19 +897,47 @@ func (c *OperatorDebugCommand) collectAgentHost(path, id string, client *api.Cli c.writeJSON(path, "agent-host.json", host, err) } +func (c *OperatorDebugCommand) collectPeriodicPprofs(client *api.Client) { + duration := time.After(c.pprofDuration) + // Create a ticker to execute on every interval ticks + ticker := time.NewTicker(c.pprofInterval) + + var pprofIntervalCount int + var name string + + // Additionally, an out of loop execute to imitate first tick + c.collectPprofs(client, pprofIntervalCount) + + for { + select { + case <-duration: + return + + case <-ticker.C: + name = fmt.Sprintf("%04d", pprofIntervalCount) + c.Ui.Output(fmt.Sprintf(" Capture pprofInterval %s", name)) + c.collectPprofs(client, pprofIntervalCount) + pprofIntervalCount++ + + case <-c.ctx.Done(): + return + } + } +} + // collectPprofs captures the /agent/pprof for each listed node -func (c *OperatorDebugCommand) collectPprofs(client *api.Client) { +func (c *OperatorDebugCommand) collectPprofs(client *api.Client, interval int) { for _, n := range c.nodeIDs { - c.collectPprof(clientDir, n, client) + c.collectPprof(clientDir, n, client, interval) } for _, n := range c.serverIDs { - c.collectPprof(serverDir, n, client) + c.collectPprof(serverDir, n, client, interval) } } // collectPprof captures pprof data for the node -func (c *OperatorDebugCommand) collectPprof(path, id string, client *api.Client) { +func (c *OperatorDebugCommand) collectPprof(path, id string, client *api.Client, interval int) { pprofDurationSeconds := int(c.pprofDuration.Seconds()) opts := api.PprofOptions{Seconds: pprofDurationSeconds} if path == serverDir { @@ -898,10 +947,11 @@ func (c *OperatorDebugCommand) collectPprof(path, id string, client *api.Client) } path = filepath.Join(path, id) + filename := fmt.Sprintf("profile_%04d.prof", interval) bs, err := client.Agent().CPUProfile(opts, c.queryOpts()) if err != nil { - c.Ui.Error(fmt.Sprintf("%s: Failed to retrieve pprof profile.prof, err: %v", path, err)) + c.Ui.Error(fmt.Sprintf("%s: Failed to retrieve pprof %s, err: %v", filename, path, err)) if structs.IsErrPermissionDenied(err) { // All Profiles require the same permissions, so we only need to see // one permission failure before we bail. @@ -911,7 +961,7 @@ func (c *OperatorDebugCommand) collectPprof(path, id string, client *api.Client) return // only exit on 403 } } else { - err := c.writeBytes(path, "profile.prof", bs) + err := c.writeBytes(path, filename, bs) if err != nil { c.Ui.Error(err.Error()) } diff --git a/command/operator_debug_test.go b/command/operator_debug_test.go index afd09489f..8d1026b49 100644 --- a/command/operator_debug_test.go +++ b/command/operator_debug_test.go @@ -164,7 +164,7 @@ func TestDebug_ClientToServer(t *testing.T) { }, { name: "client1 address - verify no SIGSEGV panic", - args: []string{"-address", addrClient1, "-duration", "250ms", "-interval", "250ms", "-server-id", "all", "-node-id", "all"}, + args: []string{"-address", addrClient1, "-duration", "250ms", "-interval", "250ms", "-server-id", "all", "-node-id", "all", "-pprof-interval", "200ms", "-pprof-duration", "250ms"}, expectedCode: 0, expectedOutputs: []string{"Created debug archive"}, }, @@ -211,12 +211,12 @@ func TestDebug_MultiRegion(t *testing.T) { // Good { name: "no region - all servers, all clients", - args: []string{"-address", addrServer1, "-duration", "250ms", "-interval", "250ms", "-server-id", "all", "-node-id", "all", "-pprof-duration", "0"}, + args: []string{"-address", addrServer1, "-duration", "250ms", "-interval", "250ms", "-server-id", "all", "-node-id", "all", "-pprof-duration", "1s", "-pprof-interval", "250ms"}, expectedCode: 0, }, { name: "region1 - server1 address", - args: []string{"-address", addrServer1, "-region", region1, "-duration", "50ms", "-interval", "50ms", "-server-id", "all", "-node-id", "all", "-pprof-duration", "0"}, + args: []string{"-address", addrServer1, "-region", region1, "-duration", "50ms", "-interval", "50ms", "-server-id", "all", "-node-id", "all", "-pprof-duration", "1s", "-pprof-interval", "250ms"}, expectedCode: 0, expectedOutputs: []string{ "Region: " + region1 + "\n", @@ -227,7 +227,7 @@ func TestDebug_MultiRegion(t *testing.T) { }, { name: "region1 - client1 address", - args: []string{"-address", addrClient1, "-region", region1, "-duration", "50ms", "-interval", "50ms", "-server-id", "all", "-node-id", "all", "-pprof-duration", "0"}, + args: []string{"-address", addrClient1, "-region", region1, "-duration", "50ms", "-interval", "50ms", "-server-id", "all", "-node-id", "all", "-pprof-duration", "1s", "-pprof-interval", "250ms"}, expectedCode: 0, expectedOutputs: []string{ "Region: " + region1 + "\n", @@ -238,7 +238,7 @@ func TestDebug_MultiRegion(t *testing.T) { }, { name: "region2 - server2 address", - args: []string{"-address", addrServer2, "-region", region2, "-duration", "50ms", "-interval", "50ms", "-server-id", "all", "-node-id", "all", "-pprof-duration", "0"}, + args: []string{"-address", addrServer2, "-region", region2, "-duration", "50ms", "-interval", "50ms", "-server-id", "all", "-node-id", "all", "-pprof-duration", "1s", "-pprof-interval", "250ms"}, expectedCode: 0, expectedOutputs: []string{ "Region: " + region2 + "\n", @@ -249,7 +249,7 @@ func TestDebug_MultiRegion(t *testing.T) { }, { name: "region2 - client2 address", - args: []string{"-address", addrClient2, "-region", region2, "-duration", "50ms", "-interval", "50ms", "-server-id", "all", "-node-id", "all", "-pprof-duration", "0"}, + args: []string{"-address", addrClient2, "-region", region2, "-duration", "50ms", "-interval", "50ms", "-server-id", "all", "-node-id", "all", "-pprof-duration", "1s", "-pprof-interval", "250ms"}, expectedCode: 0, expectedOutputs: []string{ "Region: " + region2 + "\n", @@ -262,7 +262,7 @@ func TestDebug_MultiRegion(t *testing.T) { // Bad { name: "invalid region - all servers, all clients", - args: []string{"-address", addrServer1, "-region", "never", "-duration", "50ms", "-interval", "50ms", "-server-id", "all", "-node-id", "all", "-pprof-duration", "0"}, + args: []string{"-address", addrServer1, "-region", "never", "-duration", "50ms", "-interval", "50ms", "-server-id", "all", "-node-id", "all", "-pprof-duration", "1s", "-pprof-interval", "250ms"}, expectedCode: 1, expectedError: "500 (No path to region)", }, @@ -347,6 +347,11 @@ func TestDebug_Failures(t *testing.T) { args: []string{"-pprof-duration", "baz"}, expectedCode: 1, }, + { + name: "Fails bad pprof interval", + args: []string{"-pprof-interval", "bar"}, + expectedCode: 1, + }, { name: "Fails bad address", args: []string{"-address", url + "bogus"}, @@ -379,9 +384,11 @@ func TestDebug_Bad_CSIPlugin_Names(t *testing.T) { // Setup mock UI ui := cli.NewMockUi() cmd := &OperatorDebugCommand{Meta: Meta{Ui: ui}} + testDir := t.TempDir() + defer os.Remove(testDir) // Debug on the leader and all client nodes - code := cmd.Run([]string{"-address", url, "-duration", "250ms", "-interval", "250ms", "-server-id", "leader", "-node-id", "all", "-output", os.TempDir()}) + code := cmd.Run([]string{"-address", url, "-duration", "250ms", "-interval", "250ms", "-server-id", "leader", "-node-id", "all", "-output", testDir}) assert.Equal(t, 0, code) // Bad plugin name should be escaped before it reaches the sandbox test @@ -389,7 +396,6 @@ func TestDebug_Bad_CSIPlugin_Names(t *testing.T) { require.Contains(t, ui.OutputWriter.String(), "Starting debugger") path := cmd.collectDir - defer os.Remove(path) var pluginFiles []string for _, pluginName := range cases { @@ -436,7 +442,7 @@ func TestDebug_CapturedFiles(t *testing.T) { "goroutine-debug2.txt", "goroutine.prof", "heap.prof", - "profile.prof", + "profile_0000.prof", "threadcreate.prof", "trace.prof", } @@ -470,6 +476,8 @@ func TestDebug_CapturedFiles(t *testing.T) { ui := cli.NewMockUi() cmd := &OperatorDebugCommand{Meta: Meta{Ui: ui}} + testDir := t.TempDir() + defer os.Remove(testDir) duration := 2 * time.Second interval := 750 * time.Millisecond @@ -477,18 +485,15 @@ func TestDebug_CapturedFiles(t *testing.T) { code := cmd.Run([]string{ "-address", url, - "-output", os.TempDir(), + "-output", testDir, "-server-id", serverName, "-node-id", clientID, "-duration", duration.String(), "-interval", interval.String(), - "-pprof-duration", "0", + "-pprof-duration", "1s", + "-pprof-interval", "250ms", }) - // Get capture directory - path := cmd.collectDir - defer os.Remove(path) - // There should be no errors require.Empty(t, ui.ErrorWriter.String()) require.Equal(t, 0, code) @@ -529,11 +534,12 @@ func TestDebug_ExistingOutput(t *testing.T) { // Fails existing output format := "2006-01-02-150405Z" stamped := "nomad-debug-" + time.Now().UTC().Format(format) - path := filepath.Join(os.TempDir(), stamped) + tempDir := t.TempDir() + path := filepath.Join(tempDir, stamped) os.MkdirAll(path, 0755) - defer os.Remove(path) + defer os.Remove(tempDir) - code := cmd.Run([]string{"-output", os.TempDir(), "-duration", "50ms", "-interval", "50ms"}) + code := cmd.Run([]string{"-output", tempDir, "-duration", "50ms", "-interval", "50ms"}) require.Equal(t, 2, code) } @@ -636,12 +642,12 @@ func TestDebug_WriteBytes_Nil(t *testing.T) { ui := cli.NewMockUi() cmd := &OperatorDebugCommand{Meta: Meta{Ui: ui}} - testDir = os.TempDir() + testDir = t.TempDir() + defer os.Remove(testDir) cmd.collectDir = testDir testFile = "test_nil.json" testPath = filepath.Join(testDir, testFile) - defer os.Remove(testPath) // Write nil file at top level of collect directory err := cmd.writeBytes("", testFile, testBytes) @@ -655,7 +661,7 @@ func TestDebug_WriteBytes_PathEscapesSandbox(t *testing.T) { var testDir, testFile string var testBytes []byte - testDir = os.TempDir() + testDir = t.TempDir() defer os.Remove(testDir) testFile = "testing.json" @@ -713,7 +719,7 @@ func TestDebug_CollectConsul(t *testing.T) { c.consul = ce // Setup capture directory - testDir := os.TempDir() + testDir := t.TempDir() defer os.Remove(testDir) c.collectDir = testDir @@ -756,7 +762,7 @@ func TestDebug_CollectVault(t *testing.T) { c.vault = ve // Set capture directory - testDir := os.TempDir() + testDir := t.TempDir() defer os.Remove(testDir) c.collectDir = testDir diff --git a/website/content/docs/commands/operator/debug.mdx b/website/content/docs/commands/operator/debug.mdx index d6507ee72..a583e2bfc 100644 --- a/website/content/docs/commands/operator/debug.mdx +++ b/website/content/docs/commands/operator/debug.mdx @@ -66,6 +66,10 @@ true. - `-pprof-duration=`: Duration for pprof collection. Defaults to 1s. +- `-pprof-interval=`: The interval between pprof + collections. Set interval equal to duration to capture a single + snapshot. Defaults to 250ms. + - `-server-id=,`: Comma separated list of Nomad server names to monitor for logs, API outputs, and pprof profiles. Accepts server names, "leader", or "all". Defaults to `all`. @@ -75,7 +79,7 @@ true. leadership, it may be necessary to get the configuration from a non-leader server. -- `-event-topic=:`: Enable event +- `-event-topic=:`: Enable event stream capture. Filter by comma delimited list of topic filters or "all". Defaults to "none" (disabled). Refer to the [Events API](/api-docs/events) for additional detail.