command/operator_debug: add pprof interval (#11938)

This commit is contained in:
Danish Prakash 2022-04-05 00:54:12 +05:30 committed by GitHub
parent a1f1294dc4
commit e7e8ce212e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 96 additions and 33 deletions

3
.changelog/11938.txt Normal file
View File

@ -0,0 +1,3 @@
```release-note:improvement
cli: added `-pprof-interval` to `nomad operator debug` command
```

View File

@ -38,6 +38,7 @@ type OperatorDebugCommand struct {
collectDir string
duration time.Duration
interval time.Duration
pprofInterval time.Duration
pprofDuration time.Duration
logLevel string
maxNodes int
@ -184,6 +185,10 @@ Debug Options:
-pprof-duration=<duration>
Duration for pprof collection. Defaults to 1s.
-pprof-interval=<pprof-interval>
The interval between pprof collections. Set interval equal to
duration to capture a single snapshot. Defaults to 250ms.
-server-id=<server1>,<server2>
Comma separated list of Nomad server names to monitor for logs, API
outputs, and pprof profiles. Accepts server names, "leader", or "all".
@ -334,7 +339,7 @@ func (c *OperatorDebugCommand) Run(args []string) int {
flags := c.Meta.FlagSet(c.Name(), FlagSetClient)
flags.Usage = func() { c.Ui.Output(c.Help()) }
var duration, interval, output, pprofDuration, eventTopic string
var duration, interval, pprofInterval, output, pprofDuration, eventTopic string
var eventIndex int64
var nodeIDs, serverIDs string
var allowStale bool
@ -351,6 +356,7 @@ func (c *OperatorDebugCommand) Run(args []string) int {
flags.BoolVar(&allowStale, "stale", false, "")
flags.StringVar(&output, "output", "", "")
flags.StringVar(&pprofDuration, "pprof-duration", "1s", "")
flags.StringVar(&pprofInterval, "pprof-interval", "250ms", "")
flags.BoolVar(&c.verbose, "verbose", false, "")
c.consul = &external{tls: &api.TLSConfig{}}
@ -400,6 +406,14 @@ func (c *OperatorDebugCommand) Run(args []string) int {
return 1
}
// Parse the pprof capture interval
pi, err := time.ParseDuration(pprofInterval)
if err != nil {
c.Ui.Error(fmt.Sprintf("Error parsing pprof-interval: %s: %s", pprofInterval, err.Error()))
return 1
}
c.pprofInterval = pi
// Parse the pprof capture duration
pd, err := time.ParseDuration(pprofDuration)
if err != nil {
@ -408,6 +422,12 @@ func (c *OperatorDebugCommand) Run(args []string) int {
}
c.pprofDuration = pd
// Validate pprof interval
if pi.Seconds() > pd.Seconds() {
c.Ui.Error(fmt.Sprintf("pprof-interval %s must be less than pprof-duration %s", pprofInterval, pprofDuration))
return 1
}
// Parse event stream topic filter
t, err := topicsFromString(eventTopic)
if err != nil {
@ -595,6 +615,7 @@ func (c *OperatorDebugCommand) Run(args []string) int {
}
c.Ui.Output(fmt.Sprintf(" Interval: %s", interval))
c.Ui.Output(fmt.Sprintf(" Duration: %s", duration))
c.Ui.Output(fmt.Sprintf(" pprof Interval: %s", pprofInterval))
if c.pprofDuration.Seconds() != 1 {
c.Ui.Output(fmt.Sprintf(" pprof Duration: %s", c.pprofDuration))
}
@ -663,7 +684,7 @@ func (c *OperatorDebugCommand) collect(client *api.Client) error {
c.collectVault(clusterDir, vaultAddr)
c.collectAgentHosts(client)
c.collectPprofs(client)
go c.collectPeriodicPprofs(client)
c.collectPeriodic(client)
@ -876,19 +897,47 @@ func (c *OperatorDebugCommand) collectAgentHost(path, id string, client *api.Cli
c.writeJSON(path, "agent-host.json", host, err)
}
// collectPeriodicPprofs captures pprof data repeatedly: once immediately, and
// then again on every c.pprofInterval tick until c.pprofDuration has elapsed
// or c.ctx is cancelled.
//
// Each capture is handed a distinct, increasing sequence number. collectPprof
// derives the CPU profile file name from that number, so reusing a number
// would overwrite an earlier capture.
func (c *OperatorDebugCommand) collectPeriodicPprofs(client *api.Client) {
	// Bounds the whole collection window; started before the first capture so
	// the time spent in that capture counts against the window.
	duration := time.After(c.pprofDuration)

	// Take the first capture right away instead of waiting for the first tick.
	pprofIntervalCount := 0
	c.collectPprofs(client, pprofIntervalCount)

	// time.NewTicker panics on a non-positive interval. With nothing to tick
	// on, the single capture above is all that can be collected.
	if c.pprofInterval <= 0 {
		return
	}

	// Fire once per interval; release the ticker when collection ends.
	ticker := time.NewTicker(c.pprofInterval)
	defer ticker.Stop()

	for {
		select {
		case <-duration:
			return
		case <-ticker.C:
			// Advance before capturing so this capture does not reuse the
			// sequence number of the previous one.
			pprofIntervalCount++
			c.Ui.Output(fmt.Sprintf(" Capture pprofInterval %04d", pprofIntervalCount))
			c.collectPprofs(client, pprofIntervalCount)
		case <-c.ctx.Done():
			return
		}
	}
}
// collectPprofs captures the /agent/pprof for each listed node
func (c *OperatorDebugCommand) collectPprofs(client *api.Client) {
func (c *OperatorDebugCommand) collectPprofs(client *api.Client, interval int) {
for _, n := range c.nodeIDs {
c.collectPprof(clientDir, n, client)
c.collectPprof(clientDir, n, client, interval)
}
for _, n := range c.serverIDs {
c.collectPprof(serverDir, n, client)
c.collectPprof(serverDir, n, client, interval)
}
}
// collectPprof captures pprof data for the node
func (c *OperatorDebugCommand) collectPprof(path, id string, client *api.Client) {
func (c *OperatorDebugCommand) collectPprof(path, id string, client *api.Client, interval int) {
pprofDurationSeconds := int(c.pprofDuration.Seconds())
opts := api.PprofOptions{Seconds: pprofDurationSeconds}
if path == serverDir {
@ -898,10 +947,11 @@ func (c *OperatorDebugCommand) collectPprof(path, id string, client *api.Client)
}
path = filepath.Join(path, id)
filename := fmt.Sprintf("profile_%04d.prof", interval)
bs, err := client.Agent().CPUProfile(opts, c.queryOpts())
if err != nil {
c.Ui.Error(fmt.Sprintf("%s: Failed to retrieve pprof profile.prof, err: %v", path, err))
c.Ui.Error(fmt.Sprintf("%s: Failed to retrieve pprof %s, err: %v", filename, path, err))
if structs.IsErrPermissionDenied(err) {
// All Profiles require the same permissions, so we only need to see
// one permission failure before we bail.
@ -911,7 +961,7 @@ func (c *OperatorDebugCommand) collectPprof(path, id string, client *api.Client)
return // only exit on 403
}
} else {
err := c.writeBytes(path, "profile.prof", bs)
err := c.writeBytes(path, filename, bs)
if err != nil {
c.Ui.Error(err.Error())
}

View File

@ -164,7 +164,7 @@ func TestDebug_ClientToServer(t *testing.T) {
},
{
name: "client1 address - verify no SIGSEGV panic",
args: []string{"-address", addrClient1, "-duration", "250ms", "-interval", "250ms", "-server-id", "all", "-node-id", "all"},
args: []string{"-address", addrClient1, "-duration", "250ms", "-interval", "250ms", "-server-id", "all", "-node-id", "all", "-pprof-interval", "200ms", "-pprof-duration", "250ms"},
expectedCode: 0,
expectedOutputs: []string{"Created debug archive"},
},
@ -211,12 +211,12 @@ func TestDebug_MultiRegion(t *testing.T) {
// Good
{
name: "no region - all servers, all clients",
args: []string{"-address", addrServer1, "-duration", "250ms", "-interval", "250ms", "-server-id", "all", "-node-id", "all", "-pprof-duration", "0"},
args: []string{"-address", addrServer1, "-duration", "250ms", "-interval", "250ms", "-server-id", "all", "-node-id", "all", "-pprof-duration", "1s", "-pprof-interval", "250ms"},
expectedCode: 0,
},
{
name: "region1 - server1 address",
args: []string{"-address", addrServer1, "-region", region1, "-duration", "50ms", "-interval", "50ms", "-server-id", "all", "-node-id", "all", "-pprof-duration", "0"},
args: []string{"-address", addrServer1, "-region", region1, "-duration", "50ms", "-interval", "50ms", "-server-id", "all", "-node-id", "all", "-pprof-duration", "1s", "-pprof-interval", "250ms"},
expectedCode: 0,
expectedOutputs: []string{
"Region: " + region1 + "\n",
@ -227,7 +227,7 @@ func TestDebug_MultiRegion(t *testing.T) {
},
{
name: "region1 - client1 address",
args: []string{"-address", addrClient1, "-region", region1, "-duration", "50ms", "-interval", "50ms", "-server-id", "all", "-node-id", "all", "-pprof-duration", "0"},
args: []string{"-address", addrClient1, "-region", region1, "-duration", "50ms", "-interval", "50ms", "-server-id", "all", "-node-id", "all", "-pprof-duration", "1s", "-pprof-interval", "250ms"},
expectedCode: 0,
expectedOutputs: []string{
"Region: " + region1 + "\n",
@ -238,7 +238,7 @@ func TestDebug_MultiRegion(t *testing.T) {
},
{
name: "region2 - server2 address",
args: []string{"-address", addrServer2, "-region", region2, "-duration", "50ms", "-interval", "50ms", "-server-id", "all", "-node-id", "all", "-pprof-duration", "0"},
args: []string{"-address", addrServer2, "-region", region2, "-duration", "50ms", "-interval", "50ms", "-server-id", "all", "-node-id", "all", "-pprof-duration", "1s", "-pprof-interval", "250ms"},
expectedCode: 0,
expectedOutputs: []string{
"Region: " + region2 + "\n",
@ -249,7 +249,7 @@ func TestDebug_MultiRegion(t *testing.T) {
},
{
name: "region2 - client2 address",
args: []string{"-address", addrClient2, "-region", region2, "-duration", "50ms", "-interval", "50ms", "-server-id", "all", "-node-id", "all", "-pprof-duration", "0"},
args: []string{"-address", addrClient2, "-region", region2, "-duration", "50ms", "-interval", "50ms", "-server-id", "all", "-node-id", "all", "-pprof-duration", "1s", "-pprof-interval", "250ms"},
expectedCode: 0,
expectedOutputs: []string{
"Region: " + region2 + "\n",
@ -262,7 +262,7 @@ func TestDebug_MultiRegion(t *testing.T) {
// Bad
{
name: "invalid region - all servers, all clients",
args: []string{"-address", addrServer1, "-region", "never", "-duration", "50ms", "-interval", "50ms", "-server-id", "all", "-node-id", "all", "-pprof-duration", "0"},
args: []string{"-address", addrServer1, "-region", "never", "-duration", "50ms", "-interval", "50ms", "-server-id", "all", "-node-id", "all", "-pprof-duration", "1s", "-pprof-interval", "250ms"},
expectedCode: 1,
expectedError: "500 (No path to region)",
},
@ -347,6 +347,11 @@ func TestDebug_Failures(t *testing.T) {
args: []string{"-pprof-duration", "baz"},
expectedCode: 1,
},
{
name: "Fails bad pprof interval",
args: []string{"-pprof-interval", "bar"},
expectedCode: 1,
},
{
name: "Fails bad address",
args: []string{"-address", url + "bogus"},
@ -379,9 +384,11 @@ func TestDebug_Bad_CSIPlugin_Names(t *testing.T) {
// Setup mock UI
ui := cli.NewMockUi()
cmd := &OperatorDebugCommand{Meta: Meta{Ui: ui}}
testDir := t.TempDir()
defer os.Remove(testDir)
// Debug on the leader and all client nodes
code := cmd.Run([]string{"-address", url, "-duration", "250ms", "-interval", "250ms", "-server-id", "leader", "-node-id", "all", "-output", os.TempDir()})
code := cmd.Run([]string{"-address", url, "-duration", "250ms", "-interval", "250ms", "-server-id", "leader", "-node-id", "all", "-output", testDir})
assert.Equal(t, 0, code)
// Bad plugin name should be escaped before it reaches the sandbox test
@ -389,7 +396,6 @@ func TestDebug_Bad_CSIPlugin_Names(t *testing.T) {
require.Contains(t, ui.OutputWriter.String(), "Starting debugger")
path := cmd.collectDir
defer os.Remove(path)
var pluginFiles []string
for _, pluginName := range cases {
@ -436,7 +442,7 @@ func TestDebug_CapturedFiles(t *testing.T) {
"goroutine-debug2.txt",
"goroutine.prof",
"heap.prof",
"profile.prof",
"profile_0000.prof",
"threadcreate.prof",
"trace.prof",
}
@ -470,6 +476,8 @@ func TestDebug_CapturedFiles(t *testing.T) {
ui := cli.NewMockUi()
cmd := &OperatorDebugCommand{Meta: Meta{Ui: ui}}
testDir := t.TempDir()
defer os.Remove(testDir)
duration := 2 * time.Second
interval := 750 * time.Millisecond
@ -477,18 +485,15 @@ func TestDebug_CapturedFiles(t *testing.T) {
code := cmd.Run([]string{
"-address", url,
"-output", os.TempDir(),
"-output", testDir,
"-server-id", serverName,
"-node-id", clientID,
"-duration", duration.String(),
"-interval", interval.String(),
"-pprof-duration", "0",
"-pprof-duration", "1s",
"-pprof-interval", "250ms",
})
// Get capture directory
path := cmd.collectDir
defer os.Remove(path)
// There should be no errors
require.Empty(t, ui.ErrorWriter.String())
require.Equal(t, 0, code)
@ -529,11 +534,12 @@ func TestDebug_ExistingOutput(t *testing.T) {
// Fails existing output
format := "2006-01-02-150405Z"
stamped := "nomad-debug-" + time.Now().UTC().Format(format)
path := filepath.Join(os.TempDir(), stamped)
tempDir := t.TempDir()
path := filepath.Join(tempDir, stamped)
os.MkdirAll(path, 0755)
defer os.Remove(path)
defer os.Remove(tempDir)
code := cmd.Run([]string{"-output", os.TempDir(), "-duration", "50ms", "-interval", "50ms"})
code := cmd.Run([]string{"-output", tempDir, "-duration", "50ms", "-interval", "50ms"})
require.Equal(t, 2, code)
}
@ -636,12 +642,12 @@ func TestDebug_WriteBytes_Nil(t *testing.T) {
ui := cli.NewMockUi()
cmd := &OperatorDebugCommand{Meta: Meta{Ui: ui}}
testDir = os.TempDir()
testDir = t.TempDir()
defer os.Remove(testDir)
cmd.collectDir = testDir
testFile = "test_nil.json"
testPath = filepath.Join(testDir, testFile)
defer os.Remove(testPath)
// Write nil file at top level of collect directory
err := cmd.writeBytes("", testFile, testBytes)
@ -655,7 +661,7 @@ func TestDebug_WriteBytes_PathEscapesSandbox(t *testing.T) {
var testDir, testFile string
var testBytes []byte
testDir = os.TempDir()
testDir = t.TempDir()
defer os.Remove(testDir)
testFile = "testing.json"
@ -713,7 +719,7 @@ func TestDebug_CollectConsul(t *testing.T) {
c.consul = ce
// Setup capture directory
testDir := os.TempDir()
testDir := t.TempDir()
defer os.Remove(testDir)
c.collectDir = testDir
@ -756,7 +762,7 @@ func TestDebug_CollectVault(t *testing.T) {
c.vault = ve
// Set capture directory
testDir := os.TempDir()
testDir := t.TempDir()
defer os.Remove(testDir)
c.collectDir = testDir

View File

@ -66,6 +66,10 @@ true.
- `-pprof-duration=<duration>`: Duration for pprof collection. Defaults to 1s.
- `-pprof-interval=<pprof-interval>`: The interval between pprof
collections. Set `-pprof-interval` equal to `-pprof-duration` to capture a
single snapshot. Defaults to 250ms.
- `-server-id=<server1>,<server2>`: Comma separated list of Nomad server names to
monitor for logs, API outputs, and pprof profiles. Accepts server names, "leader", or
"all". Defaults to `all`.
@ -75,7 +79,7 @@ true.
leadership, it may be necessary to get the configuration from a non-leader
server.
- `-event-topic=<allocation,evaluation,job,node,*>:<filter>`: Enable event
- `-event-topic=<allocation,evaluation,job,node,*>:<filter>`: Enable event
stream capture. Filter by comma delimited list of topic filters or "all".
Defaults to "none" (disabled). Refer to the [Events API](/api-docs/events) for
additional detail.