command/operator_debug: add pprof interval (#11938)

This commit is contained in:
Danish Prakash 2022-04-05 00:54:12 +05:30 committed by GitHub
parent a1f1294dc4
commit e7e8ce212e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 96 additions and 33 deletions

3
.changelog/11938.txt Normal file
View File

@ -0,0 +1,3 @@
```release-note:improvement
cli: added `-pprof-interval` to `nomad operator debug` command
```

View File

@ -38,6 +38,7 @@ type OperatorDebugCommand struct {
collectDir string collectDir string
duration time.Duration duration time.Duration
interval time.Duration interval time.Duration
pprofInterval time.Duration
pprofDuration time.Duration pprofDuration time.Duration
logLevel string logLevel string
maxNodes int maxNodes int
@ -184,6 +185,10 @@ Debug Options:
-pprof-duration=<duration> -pprof-duration=<duration>
Duration for pprof collection. Defaults to 1s. Duration for pprof collection. Defaults to 1s.
-pprof-interval=<pprof-interval>
The interval between pprof collections. Set interval equal to
duration to capture a single snapshot. Defaults to 250ms.
-server-id=<server1>,<server2> -server-id=<server1>,<server2>
Comma separated list of Nomad server names to monitor for logs, API Comma separated list of Nomad server names to monitor for logs, API
outputs, and pprof profiles. Accepts server names, "leader", or "all". outputs, and pprof profiles. Accepts server names, "leader", or "all".
@ -334,7 +339,7 @@ func (c *OperatorDebugCommand) Run(args []string) int {
flags := c.Meta.FlagSet(c.Name(), FlagSetClient) flags := c.Meta.FlagSet(c.Name(), FlagSetClient)
flags.Usage = func() { c.Ui.Output(c.Help()) } flags.Usage = func() { c.Ui.Output(c.Help()) }
var duration, interval, output, pprofDuration, eventTopic string var duration, interval, pprofInterval, output, pprofDuration, eventTopic string
var eventIndex int64 var eventIndex int64
var nodeIDs, serverIDs string var nodeIDs, serverIDs string
var allowStale bool var allowStale bool
@ -351,6 +356,7 @@ func (c *OperatorDebugCommand) Run(args []string) int {
flags.BoolVar(&allowStale, "stale", false, "") flags.BoolVar(&allowStale, "stale", false, "")
flags.StringVar(&output, "output", "", "") flags.StringVar(&output, "output", "", "")
flags.StringVar(&pprofDuration, "pprof-duration", "1s", "") flags.StringVar(&pprofDuration, "pprof-duration", "1s", "")
flags.StringVar(&pprofInterval, "pprof-interval", "250ms", "")
flags.BoolVar(&c.verbose, "verbose", false, "") flags.BoolVar(&c.verbose, "verbose", false, "")
c.consul = &external{tls: &api.TLSConfig{}} c.consul = &external{tls: &api.TLSConfig{}}
@ -400,6 +406,14 @@ func (c *OperatorDebugCommand) Run(args []string) int {
return 1 return 1
} }
// Parse the pprof capture interval
pi, err := time.ParseDuration(pprofInterval)
if err != nil {
c.Ui.Error(fmt.Sprintf("Error parsing pprof-interval: %s: %s", pprofInterval, err.Error()))
return 1
}
c.pprofInterval = pi
// Parse the pprof capture duration // Parse the pprof capture duration
pd, err := time.ParseDuration(pprofDuration) pd, err := time.ParseDuration(pprofDuration)
if err != nil { if err != nil {
@ -408,6 +422,12 @@ func (c *OperatorDebugCommand) Run(args []string) int {
} }
c.pprofDuration = pd c.pprofDuration = pd
// Validate pprof interval
if pi.Seconds() > pd.Seconds() {
c.Ui.Error(fmt.Sprintf("pprof-interval %s must be less than pprof-duration %s", pprofInterval, pprofDuration))
return 1
}
// Parse event stream topic filter // Parse event stream topic filter
t, err := topicsFromString(eventTopic) t, err := topicsFromString(eventTopic)
if err != nil { if err != nil {
@ -595,6 +615,7 @@ func (c *OperatorDebugCommand) Run(args []string) int {
} }
c.Ui.Output(fmt.Sprintf(" Interval: %s", interval)) c.Ui.Output(fmt.Sprintf(" Interval: %s", interval))
c.Ui.Output(fmt.Sprintf(" Duration: %s", duration)) c.Ui.Output(fmt.Sprintf(" Duration: %s", duration))
c.Ui.Output(fmt.Sprintf(" pprof Interval: %s", pprofInterval))
if c.pprofDuration.Seconds() != 1 { if c.pprofDuration.Seconds() != 1 {
c.Ui.Output(fmt.Sprintf(" pprof Duration: %s", c.pprofDuration)) c.Ui.Output(fmt.Sprintf(" pprof Duration: %s", c.pprofDuration))
} }
@ -663,7 +684,7 @@ func (c *OperatorDebugCommand) collect(client *api.Client) error {
c.collectVault(clusterDir, vaultAddr) c.collectVault(clusterDir, vaultAddr)
c.collectAgentHosts(client) c.collectAgentHosts(client)
c.collectPprofs(client) go c.collectPeriodicPprofs(client)
c.collectPeriodic(client) c.collectPeriodic(client)
@ -876,19 +897,47 @@ func (c *OperatorDebugCommand) collectAgentHost(path, id string, client *api.Cli
c.writeJSON(path, "agent-host.json", host, err) c.writeJSON(path, "agent-host.json", host, err)
} }
// collectPeriodicPprofs captures pprof data for the configured nodes and
// servers repeatedly: once immediately, and then once per c.pprofInterval tick
// until c.pprofDuration has elapsed or c.ctx is done. Each capture is passed a
// distinct, increasing interval index (0 for the immediate capture), which
// collectPprofs forwards to the per-node capture.
func (c *OperatorDebugCommand) collectPeriodicPprofs(client *api.Client) {
	// Start the overall deadline before the first capture so that the time
	// spent in that capture counts against c.pprofDuration.
	duration := time.After(c.pprofDuration)

	// Capture index 0 right away instead of waiting for the first tick.
	pprofIntervalCount := 0
	c.collectPprofs(client, pprofIntervalCount)

	// time.NewTicker panics on a non-positive interval; in that case the
	// immediate capture above is the only one taken.
	if c.pprofInterval <= 0 {
		return
	}

	// Create a ticker to execute on every interval tick, and release it when
	// this function returns.
	ticker := time.NewTicker(c.pprofInterval)
	defer ticker.Stop()

	for {
		select {
		case <-duration:
			return
		case <-ticker.C:
			// Advance the index before capturing so a tick never reuses the
			// index of the previous capture.
			pprofIntervalCount++
			c.Ui.Output(fmt.Sprintf(" Capture pprofInterval %04d", pprofIntervalCount))
			c.collectPprofs(client, pprofIntervalCount)
		case <-c.ctx.Done():
			return
		}
	}
}
// collectPprofs captures the /agent/pprof for each listed node // collectPprofs captures the /agent/pprof for each listed node
func (c *OperatorDebugCommand) collectPprofs(client *api.Client) { func (c *OperatorDebugCommand) collectPprofs(client *api.Client, interval int) {
for _, n := range c.nodeIDs { for _, n := range c.nodeIDs {
c.collectPprof(clientDir, n, client) c.collectPprof(clientDir, n, client, interval)
} }
for _, n := range c.serverIDs { for _, n := range c.serverIDs {
c.collectPprof(serverDir, n, client) c.collectPprof(serverDir, n, client, interval)
} }
} }
// collectPprof captures pprof data for the node // collectPprof captures pprof data for the node
func (c *OperatorDebugCommand) collectPprof(path, id string, client *api.Client) { func (c *OperatorDebugCommand) collectPprof(path, id string, client *api.Client, interval int) {
pprofDurationSeconds := int(c.pprofDuration.Seconds()) pprofDurationSeconds := int(c.pprofDuration.Seconds())
opts := api.PprofOptions{Seconds: pprofDurationSeconds} opts := api.PprofOptions{Seconds: pprofDurationSeconds}
if path == serverDir { if path == serverDir {
@ -898,10 +947,11 @@ func (c *OperatorDebugCommand) collectPprof(path, id string, client *api.Client)
} }
path = filepath.Join(path, id) path = filepath.Join(path, id)
filename := fmt.Sprintf("profile_%04d.prof", interval)
bs, err := client.Agent().CPUProfile(opts, c.queryOpts()) bs, err := client.Agent().CPUProfile(opts, c.queryOpts())
if err != nil { if err != nil {
c.Ui.Error(fmt.Sprintf("%s: Failed to retrieve pprof profile.prof, err: %v", path, err)) c.Ui.Error(fmt.Sprintf("%s: Failed to retrieve pprof %s, err: %v", filename, path, err))
if structs.IsErrPermissionDenied(err) { if structs.IsErrPermissionDenied(err) {
// All Profiles require the same permissions, so we only need to see // All Profiles require the same permissions, so we only need to see
// one permission failure before we bail. // one permission failure before we bail.
@ -911,7 +961,7 @@ func (c *OperatorDebugCommand) collectPprof(path, id string, client *api.Client)
return // only exit on 403 return // only exit on 403
} }
} else { } else {
err := c.writeBytes(path, "profile.prof", bs) err := c.writeBytes(path, filename, bs)
if err != nil { if err != nil {
c.Ui.Error(err.Error()) c.Ui.Error(err.Error())
} }

View File

@ -164,7 +164,7 @@ func TestDebug_ClientToServer(t *testing.T) {
}, },
{ {
name: "client1 address - verify no SIGSEGV panic", name: "client1 address - verify no SIGSEGV panic",
args: []string{"-address", addrClient1, "-duration", "250ms", "-interval", "250ms", "-server-id", "all", "-node-id", "all"}, args: []string{"-address", addrClient1, "-duration", "250ms", "-interval", "250ms", "-server-id", "all", "-node-id", "all", "-pprof-interval", "200ms", "-pprof-duration", "250ms"},
expectedCode: 0, expectedCode: 0,
expectedOutputs: []string{"Created debug archive"}, expectedOutputs: []string{"Created debug archive"},
}, },
@ -211,12 +211,12 @@ func TestDebug_MultiRegion(t *testing.T) {
// Good // Good
{ {
name: "no region - all servers, all clients", name: "no region - all servers, all clients",
args: []string{"-address", addrServer1, "-duration", "250ms", "-interval", "250ms", "-server-id", "all", "-node-id", "all", "-pprof-duration", "0"}, args: []string{"-address", addrServer1, "-duration", "250ms", "-interval", "250ms", "-server-id", "all", "-node-id", "all", "-pprof-duration", "1s", "-pprof-interval", "250ms"},
expectedCode: 0, expectedCode: 0,
}, },
{ {
name: "region1 - server1 address", name: "region1 - server1 address",
args: []string{"-address", addrServer1, "-region", region1, "-duration", "50ms", "-interval", "50ms", "-server-id", "all", "-node-id", "all", "-pprof-duration", "0"}, args: []string{"-address", addrServer1, "-region", region1, "-duration", "50ms", "-interval", "50ms", "-server-id", "all", "-node-id", "all", "-pprof-duration", "1s", "-pprof-interval", "250ms"},
expectedCode: 0, expectedCode: 0,
expectedOutputs: []string{ expectedOutputs: []string{
"Region: " + region1 + "\n", "Region: " + region1 + "\n",
@ -227,7 +227,7 @@ func TestDebug_MultiRegion(t *testing.T) {
}, },
{ {
name: "region1 - client1 address", name: "region1 - client1 address",
args: []string{"-address", addrClient1, "-region", region1, "-duration", "50ms", "-interval", "50ms", "-server-id", "all", "-node-id", "all", "-pprof-duration", "0"}, args: []string{"-address", addrClient1, "-region", region1, "-duration", "50ms", "-interval", "50ms", "-server-id", "all", "-node-id", "all", "-pprof-duration", "1s", "-pprof-interval", "250ms"},
expectedCode: 0, expectedCode: 0,
expectedOutputs: []string{ expectedOutputs: []string{
"Region: " + region1 + "\n", "Region: " + region1 + "\n",
@ -238,7 +238,7 @@ func TestDebug_MultiRegion(t *testing.T) {
}, },
{ {
name: "region2 - server2 address", name: "region2 - server2 address",
args: []string{"-address", addrServer2, "-region", region2, "-duration", "50ms", "-interval", "50ms", "-server-id", "all", "-node-id", "all", "-pprof-duration", "0"}, args: []string{"-address", addrServer2, "-region", region2, "-duration", "50ms", "-interval", "50ms", "-server-id", "all", "-node-id", "all", "-pprof-duration", "1s", "-pprof-interval", "250ms"},
expectedCode: 0, expectedCode: 0,
expectedOutputs: []string{ expectedOutputs: []string{
"Region: " + region2 + "\n", "Region: " + region2 + "\n",
@ -249,7 +249,7 @@ func TestDebug_MultiRegion(t *testing.T) {
}, },
{ {
name: "region2 - client2 address", name: "region2 - client2 address",
args: []string{"-address", addrClient2, "-region", region2, "-duration", "50ms", "-interval", "50ms", "-server-id", "all", "-node-id", "all", "-pprof-duration", "0"}, args: []string{"-address", addrClient2, "-region", region2, "-duration", "50ms", "-interval", "50ms", "-server-id", "all", "-node-id", "all", "-pprof-duration", "1s", "-pprof-interval", "250ms"},
expectedCode: 0, expectedCode: 0,
expectedOutputs: []string{ expectedOutputs: []string{
"Region: " + region2 + "\n", "Region: " + region2 + "\n",
@ -262,7 +262,7 @@ func TestDebug_MultiRegion(t *testing.T) {
// Bad // Bad
{ {
name: "invalid region - all servers, all clients", name: "invalid region - all servers, all clients",
args: []string{"-address", addrServer1, "-region", "never", "-duration", "50ms", "-interval", "50ms", "-server-id", "all", "-node-id", "all", "-pprof-duration", "0"}, args: []string{"-address", addrServer1, "-region", "never", "-duration", "50ms", "-interval", "50ms", "-server-id", "all", "-node-id", "all", "-pprof-duration", "1s", "-pprof-interval", "250ms"},
expectedCode: 1, expectedCode: 1,
expectedError: "500 (No path to region)", expectedError: "500 (No path to region)",
}, },
@ -347,6 +347,11 @@ func TestDebug_Failures(t *testing.T) {
args: []string{"-pprof-duration", "baz"}, args: []string{"-pprof-duration", "baz"},
expectedCode: 1, expectedCode: 1,
}, },
{
name: "Fails bad pprof interval",
args: []string{"-pprof-interval", "bar"},
expectedCode: 1,
},
{ {
name: "Fails bad address", name: "Fails bad address",
args: []string{"-address", url + "bogus"}, args: []string{"-address", url + "bogus"},
@ -379,9 +384,11 @@ func TestDebug_Bad_CSIPlugin_Names(t *testing.T) {
// Setup mock UI // Setup mock UI
ui := cli.NewMockUi() ui := cli.NewMockUi()
cmd := &OperatorDebugCommand{Meta: Meta{Ui: ui}} cmd := &OperatorDebugCommand{Meta: Meta{Ui: ui}}
testDir := t.TempDir()
defer os.Remove(testDir)
// Debug on the leader and all client nodes // Debug on the leader and all client nodes
code := cmd.Run([]string{"-address", url, "-duration", "250ms", "-interval", "250ms", "-server-id", "leader", "-node-id", "all", "-output", os.TempDir()}) code := cmd.Run([]string{"-address", url, "-duration", "250ms", "-interval", "250ms", "-server-id", "leader", "-node-id", "all", "-output", testDir})
assert.Equal(t, 0, code) assert.Equal(t, 0, code)
// Bad plugin name should be escaped before it reaches the sandbox test // Bad plugin name should be escaped before it reaches the sandbox test
@ -389,7 +396,6 @@ func TestDebug_Bad_CSIPlugin_Names(t *testing.T) {
require.Contains(t, ui.OutputWriter.String(), "Starting debugger") require.Contains(t, ui.OutputWriter.String(), "Starting debugger")
path := cmd.collectDir path := cmd.collectDir
defer os.Remove(path)
var pluginFiles []string var pluginFiles []string
for _, pluginName := range cases { for _, pluginName := range cases {
@ -436,7 +442,7 @@ func TestDebug_CapturedFiles(t *testing.T) {
"goroutine-debug2.txt", "goroutine-debug2.txt",
"goroutine.prof", "goroutine.prof",
"heap.prof", "heap.prof",
"profile.prof", "profile_0000.prof",
"threadcreate.prof", "threadcreate.prof",
"trace.prof", "trace.prof",
} }
@ -470,6 +476,8 @@ func TestDebug_CapturedFiles(t *testing.T) {
ui := cli.NewMockUi() ui := cli.NewMockUi()
cmd := &OperatorDebugCommand{Meta: Meta{Ui: ui}} cmd := &OperatorDebugCommand{Meta: Meta{Ui: ui}}
testDir := t.TempDir()
defer os.Remove(testDir)
duration := 2 * time.Second duration := 2 * time.Second
interval := 750 * time.Millisecond interval := 750 * time.Millisecond
@ -477,18 +485,15 @@ func TestDebug_CapturedFiles(t *testing.T) {
code := cmd.Run([]string{ code := cmd.Run([]string{
"-address", url, "-address", url,
"-output", os.TempDir(), "-output", testDir,
"-server-id", serverName, "-server-id", serverName,
"-node-id", clientID, "-node-id", clientID,
"-duration", duration.String(), "-duration", duration.String(),
"-interval", interval.String(), "-interval", interval.String(),
"-pprof-duration", "0", "-pprof-duration", "1s",
"-pprof-interval", "250ms",
}) })
// Get capture directory
path := cmd.collectDir
defer os.Remove(path)
// There should be no errors // There should be no errors
require.Empty(t, ui.ErrorWriter.String()) require.Empty(t, ui.ErrorWriter.String())
require.Equal(t, 0, code) require.Equal(t, 0, code)
@ -529,11 +534,12 @@ func TestDebug_ExistingOutput(t *testing.T) {
// Fails existing output // Fails existing output
format := "2006-01-02-150405Z" format := "2006-01-02-150405Z"
stamped := "nomad-debug-" + time.Now().UTC().Format(format) stamped := "nomad-debug-" + time.Now().UTC().Format(format)
path := filepath.Join(os.TempDir(), stamped) tempDir := t.TempDir()
path := filepath.Join(tempDir, stamped)
os.MkdirAll(path, 0755) os.MkdirAll(path, 0755)
defer os.Remove(path) defer os.Remove(tempDir)
code := cmd.Run([]string{"-output", os.TempDir(), "-duration", "50ms", "-interval", "50ms"}) code := cmd.Run([]string{"-output", tempDir, "-duration", "50ms", "-interval", "50ms"})
require.Equal(t, 2, code) require.Equal(t, 2, code)
} }
@ -636,12 +642,12 @@ func TestDebug_WriteBytes_Nil(t *testing.T) {
ui := cli.NewMockUi() ui := cli.NewMockUi()
cmd := &OperatorDebugCommand{Meta: Meta{Ui: ui}} cmd := &OperatorDebugCommand{Meta: Meta{Ui: ui}}
testDir = os.TempDir() testDir = t.TempDir()
defer os.Remove(testDir)
cmd.collectDir = testDir cmd.collectDir = testDir
testFile = "test_nil.json" testFile = "test_nil.json"
testPath = filepath.Join(testDir, testFile) testPath = filepath.Join(testDir, testFile)
defer os.Remove(testPath)
// Write nil file at top level of collect directory // Write nil file at top level of collect directory
err := cmd.writeBytes("", testFile, testBytes) err := cmd.writeBytes("", testFile, testBytes)
@ -655,7 +661,7 @@ func TestDebug_WriteBytes_PathEscapesSandbox(t *testing.T) {
var testDir, testFile string var testDir, testFile string
var testBytes []byte var testBytes []byte
testDir = os.TempDir() testDir = t.TempDir()
defer os.Remove(testDir) defer os.Remove(testDir)
testFile = "testing.json" testFile = "testing.json"
@ -713,7 +719,7 @@ func TestDebug_CollectConsul(t *testing.T) {
c.consul = ce c.consul = ce
// Setup capture directory // Setup capture directory
testDir := os.TempDir() testDir := t.TempDir()
defer os.Remove(testDir) defer os.Remove(testDir)
c.collectDir = testDir c.collectDir = testDir
@ -756,7 +762,7 @@ func TestDebug_CollectVault(t *testing.T) {
c.vault = ve c.vault = ve
// Set capture directory // Set capture directory
testDir := os.TempDir() testDir := t.TempDir()
defer os.Remove(testDir) defer os.Remove(testDir)
c.collectDir = testDir c.collectDir = testDir

View File

@ -66,6 +66,10 @@ true.
- `-pprof-duration=<duration>`: Duration for pprof collection. Defaults to 1s. - `-pprof-duration=<duration>`: Duration for pprof collection. Defaults to 1s.
- `-pprof-interval=<pprof-interval>`: The interval between pprof
collections. Must not exceed `-pprof-duration`; set it equal to
`-pprof-duration` to capture a single snapshot. Defaults to 250ms.
- `-server-id=<server1>,<server2>`: Comma separated list of Nomad server names to - `-server-id=<server1>,<server2>`: Comma separated list of Nomad server names to
monitor for logs, API outputs, and pprof profiles. Accepts server names, "leader", or monitor for logs, API outputs, and pprof profiles. Accepts server names, "leader", or
"all". Defaults to `all`. "all". Defaults to `all`.
@ -75,7 +79,7 @@ true.
leadership, it may be necessary to get the configuration from a non-leader leadership, it may be necessary to get the configuration from a non-leader
server. server.
- `-event-topic=<allocation,evaluation,job,node,*>:<filter>`: Enable event - `-event-topic=<allocation,evaluation,job,node,*>:<filter>`: Enable event
stream capture. Filter by comma delimited list of topic filters or "all". stream capture. Filter by comma delimited list of topic filters or "all".
Defaults to "none" (disabled). Refer to the [Events API](/api-docs/events) for Defaults to "none" (disabled). Refer to the [Events API](/api-docs/events) for
additional detail. additional detail.