command/operator_debug: add pprof interval (#11938)
This commit is contained in:
parent
a1f1294dc4
commit
e7e8ce212e
|
@ -0,0 +1,3 @@
|
|||
```release-note:improvement
|
||||
cli: added `-pprof-interval` to `nomad operator debug` command
|
||||
```
|
|
@ -38,6 +38,7 @@ type OperatorDebugCommand struct {
|
|||
collectDir string
|
||||
duration time.Duration
|
||||
interval time.Duration
|
||||
pprofInterval time.Duration
|
||||
pprofDuration time.Duration
|
||||
logLevel string
|
||||
maxNodes int
|
||||
|
@ -184,6 +185,10 @@ Debug Options:
|
|||
-pprof-duration=<duration>
|
||||
Duration for pprof collection. Defaults to 1s.
|
||||
|
||||
-pprof-interval=<pprof-interval>
|
||||
The interval between pprof collections. Set interval equal to
|
||||
duration to capture a single snapshot. Defaults to 250ms.
|
||||
|
||||
-server-id=<server1>,<server2>
|
||||
Comma separated list of Nomad server names to monitor for logs, API
|
||||
outputs, and pprof profiles. Accepts server names, "leader", or "all".
|
||||
|
@ -334,7 +339,7 @@ func (c *OperatorDebugCommand) Run(args []string) int {
|
|||
flags := c.Meta.FlagSet(c.Name(), FlagSetClient)
|
||||
flags.Usage = func() { c.Ui.Output(c.Help()) }
|
||||
|
||||
var duration, interval, output, pprofDuration, eventTopic string
|
||||
var duration, interval, pprofInterval, output, pprofDuration, eventTopic string
|
||||
var eventIndex int64
|
||||
var nodeIDs, serverIDs string
|
||||
var allowStale bool
|
||||
|
@ -351,6 +356,7 @@ func (c *OperatorDebugCommand) Run(args []string) int {
|
|||
flags.BoolVar(&allowStale, "stale", false, "")
|
||||
flags.StringVar(&output, "output", "", "")
|
||||
flags.StringVar(&pprofDuration, "pprof-duration", "1s", "")
|
||||
flags.StringVar(&pprofInterval, "pprof-interval", "250ms", "")
|
||||
flags.BoolVar(&c.verbose, "verbose", false, "")
|
||||
|
||||
c.consul = &external{tls: &api.TLSConfig{}}
|
||||
|
@ -400,6 +406,14 @@ func (c *OperatorDebugCommand) Run(args []string) int {
|
|||
return 1
|
||||
}
|
||||
|
||||
// Parse the pprof capture interval
|
||||
pi, err := time.ParseDuration(pprofInterval)
|
||||
if err != nil {
|
||||
c.Ui.Error(fmt.Sprintf("Error parsing pprof-interval: %s: %s", pprofInterval, err.Error()))
|
||||
return 1
|
||||
}
|
||||
c.pprofInterval = pi
|
||||
|
||||
// Parse the pprof capture duration
|
||||
pd, err := time.ParseDuration(pprofDuration)
|
||||
if err != nil {
|
||||
|
@ -408,6 +422,12 @@ func (c *OperatorDebugCommand) Run(args []string) int {
|
|||
}
|
||||
c.pprofDuration = pd
|
||||
|
||||
// Validate pprof interval
|
||||
if pi.Seconds() > pd.Seconds() {
|
||||
c.Ui.Error(fmt.Sprintf("pprof-interval %s must be less than pprof-duration %s", pprofInterval, pprofDuration))
|
||||
return 1
|
||||
}
|
||||
|
||||
// Parse event stream topic filter
|
||||
t, err := topicsFromString(eventTopic)
|
||||
if err != nil {
|
||||
|
@ -595,6 +615,7 @@ func (c *OperatorDebugCommand) Run(args []string) int {
|
|||
}
|
||||
c.Ui.Output(fmt.Sprintf(" Interval: %s", interval))
|
||||
c.Ui.Output(fmt.Sprintf(" Duration: %s", duration))
|
||||
c.Ui.Output(fmt.Sprintf(" pprof Interval: %s", pprofInterval))
|
||||
if c.pprofDuration.Seconds() != 1 {
|
||||
c.Ui.Output(fmt.Sprintf(" pprof Duration: %s", c.pprofDuration))
|
||||
}
|
||||
|
@ -663,7 +684,7 @@ func (c *OperatorDebugCommand) collect(client *api.Client) error {
|
|||
c.collectVault(clusterDir, vaultAddr)
|
||||
|
||||
c.collectAgentHosts(client)
|
||||
c.collectPprofs(client)
|
||||
go c.collectPeriodicPprofs(client)
|
||||
|
||||
c.collectPeriodic(client)
|
||||
|
||||
|
@ -876,19 +897,47 @@ func (c *OperatorDebugCommand) collectAgentHost(path, id string, client *api.Cli
|
|||
c.writeJSON(path, "agent-host.json", host, err)
|
||||
}
|
||||
|
||||
func (c *OperatorDebugCommand) collectPeriodicPprofs(client *api.Client) {
|
||||
duration := time.After(c.pprofDuration)
|
||||
// Create a ticker to execute on every interval ticks
|
||||
ticker := time.NewTicker(c.pprofInterval)
|
||||
|
||||
var pprofIntervalCount int
|
||||
var name string
|
||||
|
||||
// Additionally, an out of loop execute to imitate first tick
|
||||
c.collectPprofs(client, pprofIntervalCount)
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-duration:
|
||||
return
|
||||
|
||||
case <-ticker.C:
|
||||
name = fmt.Sprintf("%04d", pprofIntervalCount)
|
||||
c.Ui.Output(fmt.Sprintf(" Capture pprofInterval %s", name))
|
||||
c.collectPprofs(client, pprofIntervalCount)
|
||||
pprofIntervalCount++
|
||||
|
||||
case <-c.ctx.Done():
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// collectPprofs captures the /agent/pprof for each listed node
|
||||
func (c *OperatorDebugCommand) collectPprofs(client *api.Client) {
|
||||
func (c *OperatorDebugCommand) collectPprofs(client *api.Client, interval int) {
|
||||
for _, n := range c.nodeIDs {
|
||||
c.collectPprof(clientDir, n, client)
|
||||
c.collectPprof(clientDir, n, client, interval)
|
||||
}
|
||||
|
||||
for _, n := range c.serverIDs {
|
||||
c.collectPprof(serverDir, n, client)
|
||||
c.collectPprof(serverDir, n, client, interval)
|
||||
}
|
||||
}
|
||||
|
||||
// collectPprof captures pprof data for the node
|
||||
func (c *OperatorDebugCommand) collectPprof(path, id string, client *api.Client) {
|
||||
func (c *OperatorDebugCommand) collectPprof(path, id string, client *api.Client, interval int) {
|
||||
pprofDurationSeconds := int(c.pprofDuration.Seconds())
|
||||
opts := api.PprofOptions{Seconds: pprofDurationSeconds}
|
||||
if path == serverDir {
|
||||
|
@ -898,10 +947,11 @@ func (c *OperatorDebugCommand) collectPprof(path, id string, client *api.Client)
|
|||
}
|
||||
|
||||
path = filepath.Join(path, id)
|
||||
filename := fmt.Sprintf("profile_%04d.prof", interval)
|
||||
|
||||
bs, err := client.Agent().CPUProfile(opts, c.queryOpts())
|
||||
if err != nil {
|
||||
c.Ui.Error(fmt.Sprintf("%s: Failed to retrieve pprof profile.prof, err: %v", path, err))
|
||||
c.Ui.Error(fmt.Sprintf("%s: Failed to retrieve pprof %s, err: %v", filename, path, err))
|
||||
if structs.IsErrPermissionDenied(err) {
|
||||
// All Profiles require the same permissions, so we only need to see
|
||||
// one permission failure before we bail.
|
||||
|
@ -911,7 +961,7 @@ func (c *OperatorDebugCommand) collectPprof(path, id string, client *api.Client)
|
|||
return // only exit on 403
|
||||
}
|
||||
} else {
|
||||
err := c.writeBytes(path, "profile.prof", bs)
|
||||
err := c.writeBytes(path, filename, bs)
|
||||
if err != nil {
|
||||
c.Ui.Error(err.Error())
|
||||
}
|
||||
|
|
|
@ -164,7 +164,7 @@ func TestDebug_ClientToServer(t *testing.T) {
|
|||
},
|
||||
{
|
||||
name: "client1 address - verify no SIGSEGV panic",
|
||||
args: []string{"-address", addrClient1, "-duration", "250ms", "-interval", "250ms", "-server-id", "all", "-node-id", "all"},
|
||||
args: []string{"-address", addrClient1, "-duration", "250ms", "-interval", "250ms", "-server-id", "all", "-node-id", "all", "-pprof-interval", "200ms", "-pprof-duration", "250ms"},
|
||||
expectedCode: 0,
|
||||
expectedOutputs: []string{"Created debug archive"},
|
||||
},
|
||||
|
@ -211,12 +211,12 @@ func TestDebug_MultiRegion(t *testing.T) {
|
|||
// Good
|
||||
{
|
||||
name: "no region - all servers, all clients",
|
||||
args: []string{"-address", addrServer1, "-duration", "250ms", "-interval", "250ms", "-server-id", "all", "-node-id", "all", "-pprof-duration", "0"},
|
||||
args: []string{"-address", addrServer1, "-duration", "250ms", "-interval", "250ms", "-server-id", "all", "-node-id", "all", "-pprof-duration", "1s", "-pprof-interval", "250ms"},
|
||||
expectedCode: 0,
|
||||
},
|
||||
{
|
||||
name: "region1 - server1 address",
|
||||
args: []string{"-address", addrServer1, "-region", region1, "-duration", "50ms", "-interval", "50ms", "-server-id", "all", "-node-id", "all", "-pprof-duration", "0"},
|
||||
args: []string{"-address", addrServer1, "-region", region1, "-duration", "50ms", "-interval", "50ms", "-server-id", "all", "-node-id", "all", "-pprof-duration", "1s", "-pprof-interval", "250ms"},
|
||||
expectedCode: 0,
|
||||
expectedOutputs: []string{
|
||||
"Region: " + region1 + "\n",
|
||||
|
@ -227,7 +227,7 @@ func TestDebug_MultiRegion(t *testing.T) {
|
|||
},
|
||||
{
|
||||
name: "region1 - client1 address",
|
||||
args: []string{"-address", addrClient1, "-region", region1, "-duration", "50ms", "-interval", "50ms", "-server-id", "all", "-node-id", "all", "-pprof-duration", "0"},
|
||||
args: []string{"-address", addrClient1, "-region", region1, "-duration", "50ms", "-interval", "50ms", "-server-id", "all", "-node-id", "all", "-pprof-duration", "1s", "-pprof-interval", "250ms"},
|
||||
expectedCode: 0,
|
||||
expectedOutputs: []string{
|
||||
"Region: " + region1 + "\n",
|
||||
|
@ -238,7 +238,7 @@ func TestDebug_MultiRegion(t *testing.T) {
|
|||
},
|
||||
{
|
||||
name: "region2 - server2 address",
|
||||
args: []string{"-address", addrServer2, "-region", region2, "-duration", "50ms", "-interval", "50ms", "-server-id", "all", "-node-id", "all", "-pprof-duration", "0"},
|
||||
args: []string{"-address", addrServer2, "-region", region2, "-duration", "50ms", "-interval", "50ms", "-server-id", "all", "-node-id", "all", "-pprof-duration", "1s", "-pprof-interval", "250ms"},
|
||||
expectedCode: 0,
|
||||
expectedOutputs: []string{
|
||||
"Region: " + region2 + "\n",
|
||||
|
@ -249,7 +249,7 @@ func TestDebug_MultiRegion(t *testing.T) {
|
|||
},
|
||||
{
|
||||
name: "region2 - client2 address",
|
||||
args: []string{"-address", addrClient2, "-region", region2, "-duration", "50ms", "-interval", "50ms", "-server-id", "all", "-node-id", "all", "-pprof-duration", "0"},
|
||||
args: []string{"-address", addrClient2, "-region", region2, "-duration", "50ms", "-interval", "50ms", "-server-id", "all", "-node-id", "all", "-pprof-duration", "1s", "-pprof-interval", "250ms"},
|
||||
expectedCode: 0,
|
||||
expectedOutputs: []string{
|
||||
"Region: " + region2 + "\n",
|
||||
|
@ -262,7 +262,7 @@ func TestDebug_MultiRegion(t *testing.T) {
|
|||
// Bad
|
||||
{
|
||||
name: "invalid region - all servers, all clients",
|
||||
args: []string{"-address", addrServer1, "-region", "never", "-duration", "50ms", "-interval", "50ms", "-server-id", "all", "-node-id", "all", "-pprof-duration", "0"},
|
||||
args: []string{"-address", addrServer1, "-region", "never", "-duration", "50ms", "-interval", "50ms", "-server-id", "all", "-node-id", "all", "-pprof-duration", "1s", "-pprof-interval", "250ms"},
|
||||
expectedCode: 1,
|
||||
expectedError: "500 (No path to region)",
|
||||
},
|
||||
|
@ -347,6 +347,11 @@ func TestDebug_Failures(t *testing.T) {
|
|||
args: []string{"-pprof-duration", "baz"},
|
||||
expectedCode: 1,
|
||||
},
|
||||
{
|
||||
name: "Fails bad pprof interval",
|
||||
args: []string{"-pprof-interval", "bar"},
|
||||
expectedCode: 1,
|
||||
},
|
||||
{
|
||||
name: "Fails bad address",
|
||||
args: []string{"-address", url + "bogus"},
|
||||
|
@ -379,9 +384,11 @@ func TestDebug_Bad_CSIPlugin_Names(t *testing.T) {
|
|||
// Setup mock UI
|
||||
ui := cli.NewMockUi()
|
||||
cmd := &OperatorDebugCommand{Meta: Meta{Ui: ui}}
|
||||
testDir := t.TempDir()
|
||||
defer os.Remove(testDir)
|
||||
|
||||
// Debug on the leader and all client nodes
|
||||
code := cmd.Run([]string{"-address", url, "-duration", "250ms", "-interval", "250ms", "-server-id", "leader", "-node-id", "all", "-output", os.TempDir()})
|
||||
code := cmd.Run([]string{"-address", url, "-duration", "250ms", "-interval", "250ms", "-server-id", "leader", "-node-id", "all", "-output", testDir})
|
||||
assert.Equal(t, 0, code)
|
||||
|
||||
// Bad plugin name should be escaped before it reaches the sandbox test
|
||||
|
@ -389,7 +396,6 @@ func TestDebug_Bad_CSIPlugin_Names(t *testing.T) {
|
|||
require.Contains(t, ui.OutputWriter.String(), "Starting debugger")
|
||||
|
||||
path := cmd.collectDir
|
||||
defer os.Remove(path)
|
||||
|
||||
var pluginFiles []string
|
||||
for _, pluginName := range cases {
|
||||
|
@ -436,7 +442,7 @@ func TestDebug_CapturedFiles(t *testing.T) {
|
|||
"goroutine-debug2.txt",
|
||||
"goroutine.prof",
|
||||
"heap.prof",
|
||||
"profile.prof",
|
||||
"profile_0000.prof",
|
||||
"threadcreate.prof",
|
||||
"trace.prof",
|
||||
}
|
||||
|
@ -470,6 +476,8 @@ func TestDebug_CapturedFiles(t *testing.T) {
|
|||
|
||||
ui := cli.NewMockUi()
|
||||
cmd := &OperatorDebugCommand{Meta: Meta{Ui: ui}}
|
||||
testDir := t.TempDir()
|
||||
defer os.Remove(testDir)
|
||||
|
||||
duration := 2 * time.Second
|
||||
interval := 750 * time.Millisecond
|
||||
|
@ -477,18 +485,15 @@ func TestDebug_CapturedFiles(t *testing.T) {
|
|||
|
||||
code := cmd.Run([]string{
|
||||
"-address", url,
|
||||
"-output", os.TempDir(),
|
||||
"-output", testDir,
|
||||
"-server-id", serverName,
|
||||
"-node-id", clientID,
|
||||
"-duration", duration.String(),
|
||||
"-interval", interval.String(),
|
||||
"-pprof-duration", "0",
|
||||
"-pprof-duration", "1s",
|
||||
"-pprof-interval", "250ms",
|
||||
})
|
||||
|
||||
// Get capture directory
|
||||
path := cmd.collectDir
|
||||
defer os.Remove(path)
|
||||
|
||||
// There should be no errors
|
||||
require.Empty(t, ui.ErrorWriter.String())
|
||||
require.Equal(t, 0, code)
|
||||
|
@ -529,11 +534,12 @@ func TestDebug_ExistingOutput(t *testing.T) {
|
|||
// Fails existing output
|
||||
format := "2006-01-02-150405Z"
|
||||
stamped := "nomad-debug-" + time.Now().UTC().Format(format)
|
||||
path := filepath.Join(os.TempDir(), stamped)
|
||||
tempDir := t.TempDir()
|
||||
path := filepath.Join(tempDir, stamped)
|
||||
os.MkdirAll(path, 0755)
|
||||
defer os.Remove(path)
|
||||
defer os.Remove(tempDir)
|
||||
|
||||
code := cmd.Run([]string{"-output", os.TempDir(), "-duration", "50ms", "-interval", "50ms"})
|
||||
code := cmd.Run([]string{"-output", tempDir, "-duration", "50ms", "-interval", "50ms"})
|
||||
require.Equal(t, 2, code)
|
||||
}
|
||||
|
||||
|
@ -636,12 +642,12 @@ func TestDebug_WriteBytes_Nil(t *testing.T) {
|
|||
ui := cli.NewMockUi()
|
||||
cmd := &OperatorDebugCommand{Meta: Meta{Ui: ui}}
|
||||
|
||||
testDir = os.TempDir()
|
||||
testDir = t.TempDir()
|
||||
defer os.Remove(testDir)
|
||||
cmd.collectDir = testDir
|
||||
|
||||
testFile = "test_nil.json"
|
||||
testPath = filepath.Join(testDir, testFile)
|
||||
defer os.Remove(testPath)
|
||||
|
||||
// Write nil file at top level of collect directory
|
||||
err := cmd.writeBytes("", testFile, testBytes)
|
||||
|
@ -655,7 +661,7 @@ func TestDebug_WriteBytes_PathEscapesSandbox(t *testing.T) {
|
|||
var testDir, testFile string
|
||||
var testBytes []byte
|
||||
|
||||
testDir = os.TempDir()
|
||||
testDir = t.TempDir()
|
||||
defer os.Remove(testDir)
|
||||
|
||||
testFile = "testing.json"
|
||||
|
@ -713,7 +719,7 @@ func TestDebug_CollectConsul(t *testing.T) {
|
|||
c.consul = ce
|
||||
|
||||
// Setup capture directory
|
||||
testDir := os.TempDir()
|
||||
testDir := t.TempDir()
|
||||
defer os.Remove(testDir)
|
||||
c.collectDir = testDir
|
||||
|
||||
|
@ -756,7 +762,7 @@ func TestDebug_CollectVault(t *testing.T) {
|
|||
c.vault = ve
|
||||
|
||||
// Set capture directory
|
||||
testDir := os.TempDir()
|
||||
testDir := t.TempDir()
|
||||
defer os.Remove(testDir)
|
||||
c.collectDir = testDir
|
||||
|
||||
|
|
|
@ -66,6 +66,10 @@ true.
|
|||
|
||||
- `-pprof-duration=<duration>`: Duration for pprof collection. Defaults to 1s.
|
||||
|
||||
- `-pprof-interval=<pprof-interval>`: The interval between pprof
|
||||
collections. Set interval equal to duration to capture a single
|
||||
snapshot. Defaults to 250ms.
|
||||
|
||||
- `-server-id=<server1>,<server2>`: Comma separated list of Nomad server names to
|
||||
monitor for logs, API outputs, and pprof profiles. Accepts server names, "leader", or
|
||||
"all". Defaults to `all`.
|
||||
|
@ -75,7 +79,7 @@ true.
|
|||
leadership, it may be necessary to get the configuration from a non-leader
|
||||
server.
|
||||
|
||||
- `-event-topic=<allocation,evaluation,job,node,*>:<filter>`: Enable event
|
||||
- `-event-topic=<allocation,evaluation,job,node,*>:<filter>`: Enable event
|
||||
stream capture. Filter by comma delimited list of topic filters or "all".
|
||||
Defaults to "none" (disabled). Refer to the [Events API](/api-docs/events) for
|
||||
additional detail.
|
||||
|
|
Loading…
Reference in New Issue