command/operator_debug: add pprof interval (#11938)

2022-04-05 00:54:12 +05:30 · 2022-04-05 00:54:12 +05:30 · e7e8ce212e
parent a1f1294dc4
commit e7e8ce212e
4 changed files with 96 additions and 33 deletions
--- a/.changelog/11938.txt
+++ b/.changelog/11938.txt
@ -0,0 +1,3 @@
+```release-note:improvement
+cli: added `-pprof-interval` to `nomad operator debug` command
+```
--- a/command/operator_debug.go
+++ b/command/operator_debug.go
@ -38,6 +38,7 @@ type OperatorDebugCommand struct {
 	collectDir    string
 	duration      time.Duration
 	interval      time.Duration
+	pprofInterval time.Duration
 	pprofDuration time.Duration
 	logLevel      string
 	maxNodes      int
@ -184,6 +185,10 @@ Debug Options:
  -pprof-duration=<duration>
    Duration for pprof collection. Defaults to 1s.

+  -pprof-interval=<pprof-interval>
+    The interval between pprof collections. Set interval equal to
+    duration to capture a single snapshot. Defaults to 250ms.
+
  -server-id=<server1>,<server2>
    Comma separated list of Nomad server names to monitor for logs, API
    outputs, and pprof profiles. Accepts server names, "leader", or "all".
@ -334,7 +339,7 @@ func (c *OperatorDebugCommand) Run(args []string) int {
 	flags := c.Meta.FlagSet(c.Name(), FlagSetClient)
 	flags.Usage = func() { c.Ui.Output(c.Help()) }

-	var duration, interval, output, pprofDuration, eventTopic string
+	var duration, interval, pprofInterval, output, pprofDuration, eventTopic string
 	var eventIndex int64
 	var nodeIDs, serverIDs string
 	var allowStale bool
@ -351,6 +356,7 @@ func (c *OperatorDebugCommand) Run(args []string) int {
 	flags.BoolVar(&allowStale, "stale", false, "")
 	flags.StringVar(&output, "output", "", "")
 	flags.StringVar(&pprofDuration, "pprof-duration", "1s", "")
+	flags.StringVar(&pprofInterval, "pprof-interval", "250ms", "")
 	flags.BoolVar(&c.verbose, "verbose", false, "")

 	c.consul = &external{tls: &api.TLSConfig{}}
@ -400,6 +406,14 @@ func (c *OperatorDebugCommand) Run(args []string) int {
 		return 1
 	}

+	// Parse the pprof capture interval
+	pi, err := time.ParseDuration(pprofInterval)
+	if err != nil {
+		c.Ui.Error(fmt.Sprintf("Error parsing pprof-interval: %s: %s", pprofInterval, err.Error()))
+		return 1
+	}
+	c.pprofInterval = pi
+
 	// Parse the pprof capture duration
 	pd, err := time.ParseDuration(pprofDuration)
 	if err != nil {
@ -408,6 +422,12 @@ func (c *OperatorDebugCommand) Run(args []string) int {
 	}
 	c.pprofDuration = pd

+	// Validate pprof interval
+	if pi.Seconds() > pd.Seconds() {
+		c.Ui.Error(fmt.Sprintf("pprof-interval %s must be less than pprof-duration %s", pprofInterval, pprofDuration))
+		return 1
+	}
+
 	// Parse event stream topic filter
 	t, err := topicsFromString(eventTopic)
 	if err != nil {
@ -595,6 +615,7 @@ func (c *OperatorDebugCommand) Run(args []string) int {
 	}
 	c.Ui.Output(fmt.Sprintf("         Interval: %s", interval))
 	c.Ui.Output(fmt.Sprintf("         Duration: %s", duration))
+	c.Ui.Output(fmt.Sprintf("         pprof Interval: %s", pprofInterval))
 	if c.pprofDuration.Seconds() != 1 {
 		c.Ui.Output(fmt.Sprintf("   pprof Duration: %s", c.pprofDuration))
 	}
@ -663,7 +684,7 @@ func (c *OperatorDebugCommand) collect(client *api.Client) error {
 	c.collectVault(clusterDir, vaultAddr)

 	c.collectAgentHosts(client)
-	c.collectPprofs(client)
+	go c.collectPeriodicPprofs(client)

 	c.collectPeriodic(client)

@ -876,19 +897,47 @@ func (c *OperatorDebugCommand) collectAgentHost(path, id string, client *api.Cli
 	c.writeJSON(path, "agent-host.json", host, err)
 }

+func (c *OperatorDebugCommand) collectPeriodicPprofs(client *api.Client) {
+	duration := time.After(c.pprofDuration)
+	// Create a ticker to execute on every interval ticks
+	ticker := time.NewTicker(c.pprofInterval)
+
+	var pprofIntervalCount int
+	var name string
+
+	// Additionally, an out of loop execute to imitate first tick
+	c.collectPprofs(client, pprofIntervalCount)
+
+	for {
+		select {
+		case <-duration:
+			return
+
+		case <-ticker.C:
+			name = fmt.Sprintf("%04d", pprofIntervalCount)
+			c.Ui.Output(fmt.Sprintf("    Capture pprofInterval %s", name))
+			c.collectPprofs(client, pprofIntervalCount)
+			pprofIntervalCount++
+
+		case <-c.ctx.Done():
+			return
+		}
+	}
+}
+
 // collectPprofs captures the /agent/pprof for each listed node
-func (c *OperatorDebugCommand) collectPprofs(client *api.Client) {
+func (c *OperatorDebugCommand) collectPprofs(client *api.Client, interval int) {
 	for _, n := range c.nodeIDs {
-		c.collectPprof(clientDir, n, client)
+		c.collectPprof(clientDir, n, client, interval)
 	}

 	for _, n := range c.serverIDs {
-		c.collectPprof(serverDir, n, client)
+		c.collectPprof(serverDir, n, client, interval)
 	}
 }

 // collectPprof captures pprof data for the node
-func (c *OperatorDebugCommand) collectPprof(path, id string, client *api.Client) {
+func (c *OperatorDebugCommand) collectPprof(path, id string, client *api.Client, interval int) {
 	pprofDurationSeconds := int(c.pprofDuration.Seconds())
 	opts := api.PprofOptions{Seconds: pprofDurationSeconds}
 	if path == serverDir {
@ -898,10 +947,11 @@ func (c *OperatorDebugCommand) collectPprof(path, id string, client *api.Client)
 	}

 	path = filepath.Join(path, id)
+	filename := fmt.Sprintf("profile_%04d.prof", interval)

 	bs, err := client.Agent().CPUProfile(opts, c.queryOpts())
 	if err != nil {
-		c.Ui.Error(fmt.Sprintf("%s: Failed to retrieve pprof profile.prof, err: %v", path, err))
+		c.Ui.Error(fmt.Sprintf("%s: Failed to retrieve pprof %s, err: %v", filename, path, err))
 		if structs.IsErrPermissionDenied(err) {
 			// All Profiles require the same permissions, so we only need to see
 			// one permission failure before we bail.
@ -911,7 +961,7 @@ func (c *OperatorDebugCommand) collectPprof(path, id string, client *api.Client)
 			return // only exit on 403
 		}
 	} else {
-		err := c.writeBytes(path, "profile.prof", bs)
+		err := c.writeBytes(path, filename, bs)
 		if err != nil {
 			c.Ui.Error(err.Error())
 		}
--- a/command/operator_debug_test.go
+++ b/command/operator_debug_test.go
@ -164,7 +164,7 @@ func TestDebug_ClientToServer(t *testing.T) {
 		},
 		{
 			name:            "client1 address - verify no SIGSEGV panic",
-			args:            []string{"-address", addrClient1, "-duration", "250ms", "-interval", "250ms", "-server-id", "all", "-node-id", "all"},
+			args:            []string{"-address", addrClient1, "-duration", "250ms", "-interval", "250ms", "-server-id", "all", "-node-id", "all", "-pprof-interval", "200ms", "-pprof-duration", "250ms"},
 			expectedCode:    0,
 			expectedOutputs: []string{"Created debug archive"},
 		},
@ -211,12 +211,12 @@ func TestDebug_MultiRegion(t *testing.T) {
 		// Good
 		{
 			name:         "no region - all servers, all clients",
-			args:         []string{"-address", addrServer1, "-duration", "250ms", "-interval", "250ms", "-server-id", "all", "-node-id", "all", "-pprof-duration", "0"},
+			args:         []string{"-address", addrServer1, "-duration", "250ms", "-interval", "250ms", "-server-id", "all", "-node-id", "all", "-pprof-duration", "1s", "-pprof-interval", "250ms"},
 			expectedCode: 0,
 		},
 		{
 			name:         "region1 - server1 address",
-			args:         []string{"-address", addrServer1, "-region", region1, "-duration", "50ms", "-interval", "50ms", "-server-id", "all", "-node-id", "all", "-pprof-duration", "0"},
+			args:         []string{"-address", addrServer1, "-region", region1, "-duration", "50ms", "-interval", "50ms", "-server-id", "all", "-node-id", "all", "-pprof-duration", "1s", "-pprof-interval", "250ms"},
 			expectedCode: 0,
 			expectedOutputs: []string{
 				"Region: " + region1 + "\n",
@ -227,7 +227,7 @@ func TestDebug_MultiRegion(t *testing.T) {
 		},
 		{
 			name:         "region1 - client1 address",
-			args:         []string{"-address", addrClient1, "-region", region1, "-duration", "50ms", "-interval", "50ms", "-server-id", "all", "-node-id", "all", "-pprof-duration", "0"},
+			args:         []string{"-address", addrClient1, "-region", region1, "-duration", "50ms", "-interval", "50ms", "-server-id", "all", "-node-id", "all", "-pprof-duration", "1s", "-pprof-interval", "250ms"},
 			expectedCode: 0,
 			expectedOutputs: []string{
 				"Region: " + region1 + "\n",
@ -238,7 +238,7 @@ func TestDebug_MultiRegion(t *testing.T) {
 		},
 		{
 			name:         "region2 - server2 address",
-			args:         []string{"-address", addrServer2, "-region", region2, "-duration", "50ms", "-interval", "50ms", "-server-id", "all", "-node-id", "all", "-pprof-duration", "0"},
+			args:         []string{"-address", addrServer2, "-region", region2, "-duration", "50ms", "-interval", "50ms", "-server-id", "all", "-node-id", "all", "-pprof-duration", "1s", "-pprof-interval", "250ms"},
 			expectedCode: 0,
 			expectedOutputs: []string{
 				"Region: " + region2 + "\n",
@ -249,7 +249,7 @@ func TestDebug_MultiRegion(t *testing.T) {
 		},
 		{
 			name:         "region2 - client2 address",
-			args:         []string{"-address", addrClient2, "-region", region2, "-duration", "50ms", "-interval", "50ms", "-server-id", "all", "-node-id", "all", "-pprof-duration", "0"},
+			args:         []string{"-address", addrClient2, "-region", region2, "-duration", "50ms", "-interval", "50ms", "-server-id", "all", "-node-id", "all", "-pprof-duration", "1s", "-pprof-interval", "250ms"},
 			expectedCode: 0,
 			expectedOutputs: []string{
 				"Region: " + region2 + "\n",
@ -262,7 +262,7 @@ func TestDebug_MultiRegion(t *testing.T) {
 		// Bad
 		{
 			name:          "invalid region - all servers, all clients",
-			args:          []string{"-address", addrServer1, "-region", "never", "-duration", "50ms", "-interval", "50ms", "-server-id", "all", "-node-id", "all", "-pprof-duration", "0"},
+			args:          []string{"-address", addrServer1, "-region", "never", "-duration", "50ms", "-interval", "50ms", "-server-id", "all", "-node-id", "all", "-pprof-duration", "1s", "-pprof-interval", "250ms"},
 			expectedCode:  1,
 			expectedError: "500 (No path to region)",
 		},
@ -347,6 +347,11 @@ func TestDebug_Failures(t *testing.T) {
 			args:         []string{"-pprof-duration", "baz"},
 			expectedCode: 1,
 		},
+		{
+			name:         "Fails bad pprof interval",
+			args:         []string{"-pprof-interval", "bar"},
+			expectedCode: 1,
+		},
 		{
 			name:          "Fails bad address",
 			args:          []string{"-address", url + "bogus"},
@ -379,9 +384,11 @@ func TestDebug_Bad_CSIPlugin_Names(t *testing.T) {
 	// Setup mock UI
 	ui := cli.NewMockUi()
 	cmd := &OperatorDebugCommand{Meta: Meta{Ui: ui}}
+	testDir := t.TempDir()
+	defer os.Remove(testDir)

 	// Debug on the leader and all client nodes
-	code := cmd.Run([]string{"-address", url, "-duration", "250ms", "-interval", "250ms", "-server-id", "leader", "-node-id", "all", "-output", os.TempDir()})
+	code := cmd.Run([]string{"-address", url, "-duration", "250ms", "-interval", "250ms", "-server-id", "leader", "-node-id", "all", "-output", testDir})
 	assert.Equal(t, 0, code)

 	// Bad plugin name should be escaped before it reaches the sandbox test
@ -389,7 +396,6 @@ func TestDebug_Bad_CSIPlugin_Names(t *testing.T) {
 	require.Contains(t, ui.OutputWriter.String(), "Starting debugger")

 	path := cmd.collectDir
-	defer os.Remove(path)

 	var pluginFiles []string
 	for _, pluginName := range cases {
@ -436,7 +442,7 @@ func TestDebug_CapturedFiles(t *testing.T) {
 		"goroutine-debug2.txt",
 		"goroutine.prof",
 		"heap.prof",
-		"profile.prof",
+		"profile_0000.prof",
 		"threadcreate.prof",
 		"trace.prof",
 	}
@ -470,6 +476,8 @@ func TestDebug_CapturedFiles(t *testing.T) {

 	ui := cli.NewMockUi()
 	cmd := &OperatorDebugCommand{Meta: Meta{Ui: ui}}
+	testDir := t.TempDir()
+	defer os.Remove(testDir)

 	duration := 2 * time.Second
 	interval := 750 * time.Millisecond
@ -477,18 +485,15 @@ func TestDebug_CapturedFiles(t *testing.T) {

 	code := cmd.Run([]string{
 		"-address", url,
-		"-output", os.TempDir(),
+		"-output", testDir,
 		"-server-id", serverName,
 		"-node-id", clientID,
 		"-duration", duration.String(),
 		"-interval", interval.String(),
-		"-pprof-duration", "0",
+		"-pprof-duration", "1s",
+		"-pprof-interval", "250ms",
 	})

-	// Get capture directory
-	path := cmd.collectDir
-	defer os.Remove(path)
-
 	// There should be no errors
 	require.Empty(t, ui.ErrorWriter.String())
 	require.Equal(t, 0, code)
@ -529,11 +534,12 @@ func TestDebug_ExistingOutput(t *testing.T) {
 	// Fails existing output
 	format := "2006-01-02-150405Z"
 	stamped := "nomad-debug-" + time.Now().UTC().Format(format)
-	path := filepath.Join(os.TempDir(), stamped)
+	tempDir := t.TempDir()
+	path := filepath.Join(tempDir, stamped)
 	os.MkdirAll(path, 0755)
-	defer os.Remove(path)
+	defer os.Remove(tempDir)

-	code := cmd.Run([]string{"-output", os.TempDir(), "-duration", "50ms", "-interval", "50ms"})
+	code := cmd.Run([]string{"-output", tempDir, "-duration", "50ms", "-interval", "50ms"})
 	require.Equal(t, 2, code)
 }

@ -636,12 +642,12 @@ func TestDebug_WriteBytes_Nil(t *testing.T) {
 	ui := cli.NewMockUi()
 	cmd := &OperatorDebugCommand{Meta: Meta{Ui: ui}}

-	testDir = os.TempDir()
+	testDir = t.TempDir()
+	defer os.Remove(testDir)
 	cmd.collectDir = testDir

 	testFile = "test_nil.json"
 	testPath = filepath.Join(testDir, testFile)
-	defer os.Remove(testPath)

 	// Write nil file at top level of collect directory
 	err := cmd.writeBytes("", testFile, testBytes)
@ -655,7 +661,7 @@ func TestDebug_WriteBytes_PathEscapesSandbox(t *testing.T) {
 	var testDir, testFile string
 	var testBytes []byte

-	testDir = os.TempDir()
+	testDir = t.TempDir()
 	defer os.Remove(testDir)

 	testFile = "testing.json"
@ -713,7 +719,7 @@ func TestDebug_CollectConsul(t *testing.T) {
 	c.consul = ce

 	// Setup capture directory
-	testDir := os.TempDir()
+	testDir := t.TempDir()
 	defer os.Remove(testDir)
 	c.collectDir = testDir

@ -756,7 +762,7 @@ func TestDebug_CollectVault(t *testing.T) {
 	c.vault = ve

 	// Set capture directory
-	testDir := os.TempDir()
+	testDir := t.TempDir()
 	defer os.Remove(testDir)
 	c.collectDir = testDir

--- a/website/content/docs/commands/operator/debug.mdx
+++ b/website/content/docs/commands/operator/debug.mdx
@ -66,6 +66,10 @@ true.

 - `-pprof-duration=<duration>`: Duration for pprof collection. Defaults to 1s.

+- `-pprof-interval=<pprof-interval>`: The interval between pprof
+    collections. Set interval equal to duration to capture a single
+    snapshot. Defaults to 250ms.
+
 - `-server-id=<server1>,<server2>`: Comma separated list of Nomad server names to
  monitor for logs, API outputs, and pprof profiles. Accepts server names, "leader", or
  "all". Defaults to `all`.
@ -75,7 +79,7 @@ true.
  leadership, it may be necessary to get the configuration from a non-leader
  server.

- `-event-topic=<allocation,evaluation,job,node,*>:<filter>`: Enable event 
+- `-event-topic=<allocation,evaluation,job,node,*>:<filter>`: Enable event
  stream capture. Filter by comma delimited list of topic filters or "all".
  Defaults to "none" (disabled).  Refer to the [Events API](/api-docs/events) for
  additional detail.