debug: add version constraint to avoid pprof panic (#12807)
This commit is contained in:
parent
0830e3c787
commit
97cf204c00
|
@ -0,0 +1,3 @@
|
|||
```release-note:improvement
|
||||
cli: `operator debug` command now skips generating pprofs to avoid a panic on Nomad 0.11.2. 0.11.1, and 0.11.0
|
||||
```
|
|
@ -23,6 +23,7 @@ import (
|
|||
|
||||
"github.com/hashicorp/go-cleanhttp"
|
||||
"github.com/hashicorp/go-multierror"
|
||||
goversion "github.com/hashicorp/go-version"
|
||||
"github.com/hashicorp/nomad/api"
|
||||
"github.com/hashicorp/nomad/api/contexts"
|
||||
"github.com/hashicorp/nomad/helper"
|
||||
|
@ -54,14 +55,17 @@ type OperatorDebugCommand struct {
|
|||
cancel context.CancelFunc
|
||||
opts *api.QueryOptions
|
||||
verbose bool
|
||||
members *api.ServerMembers
|
||||
nodes []*api.NodeListStub
|
||||
}
|
||||
|
||||
const (
|
||||
userAgent = "nomad operator debug"
|
||||
clusterDir = "cluster"
|
||||
clientDir = "client"
|
||||
serverDir = "server"
|
||||
intervalDir = "interval"
|
||||
userAgent = "nomad operator debug"
|
||||
clusterDir = "cluster"
|
||||
clientDir = "client"
|
||||
serverDir = "server"
|
||||
intervalDir = "interval"
|
||||
minimumVersionPprofConstraint = ">= 0.11.0, <= 0.11.2"
|
||||
)
|
||||
|
||||
func (c *OperatorDebugCommand) Help() string {
|
||||
|
@ -502,6 +506,16 @@ func (c *OperatorDebugCommand) Run(args []string) int {
|
|||
AuthToken: c.Meta.token,
|
||||
}
|
||||
|
||||
// Get complete list of client nodes
|
||||
c.nodes, _, err = client.Nodes().List(c.queryOpts())
|
||||
if err != nil {
|
||||
c.Ui.Error(fmt.Sprintf("Error querying node info: %v", err))
|
||||
return 1
|
||||
}
|
||||
|
||||
// Write nodes to file
|
||||
c.writeJSON(clusterDir, "nodes.json", c.nodes, err)
|
||||
|
||||
// Search all nodes If a node class is specified without a list of node id prefixes
|
||||
if c.nodeClass != "" && nodeIDs == "" {
|
||||
nodeIDs = "all"
|
||||
|
@ -565,17 +579,17 @@ func (c *OperatorDebugCommand) Run(args []string) int {
|
|||
}
|
||||
|
||||
// Resolve servers
|
||||
members, err := client.Agent().MembersOpts(c.queryOpts())
|
||||
c.members, err = client.Agent().MembersOpts(c.queryOpts())
|
||||
if err != nil {
|
||||
c.Ui.Error(fmt.Sprintf("Failed to retrieve server list; err: %v", err))
|
||||
return 1
|
||||
}
|
||||
|
||||
// Write complete list of server members to file
|
||||
c.writeJSON(clusterDir, "members.json", members, err)
|
||||
c.writeJSON(clusterDir, "members.json", c.members, err)
|
||||
|
||||
// Filter for servers matching criteria
|
||||
c.serverIDs, err = filterServerMembers(members, serverIDs, c.region)
|
||||
c.serverIDs, err = filterServerMembers(c.members, serverIDs, c.region)
|
||||
if err != nil {
|
||||
c.Ui.Error(fmt.Sprintf("Failed to parse server list; err: %v", err))
|
||||
return 1
|
||||
|
@ -584,8 +598,8 @@ func (c *OperatorDebugCommand) Run(args []string) int {
|
|||
serversFound := 0
|
||||
serverCaptureCount := 0
|
||||
|
||||
if members != nil {
|
||||
serversFound = len(members.Members)
|
||||
if c.members != nil {
|
||||
serversFound = len(c.members.Members)
|
||||
}
|
||||
if c.serverIDs != nil {
|
||||
serverCaptureCount = len(c.serverIDs)
|
||||
|
@ -900,9 +914,31 @@ func (c *OperatorDebugCommand) collectAgentHost(path, id string, client *api.Cli
|
|||
|
||||
func (c *OperatorDebugCommand) collectPeriodicPprofs(client *api.Client) {
|
||||
|
||||
pprofNodeIDs := []string{}
|
||||
pprofServerIDs := []string{}
|
||||
|
||||
// threadcreate pprof causes a panic on Nomad 0.11.0 to 0.11.2 -- skip those versions
|
||||
for _, serverID := range c.serverIDs {
|
||||
version := c.getNomadVersion(serverID, "")
|
||||
err := checkVersion(version, minimumVersionPprofConstraint)
|
||||
if err != nil {
|
||||
c.Ui.Warn(fmt.Sprintf("Skipping pprof: %v", err))
|
||||
}
|
||||
pprofServerIDs = append(pprofServerIDs, serverID)
|
||||
}
|
||||
|
||||
for _, nodeID := range c.nodeIDs {
|
||||
version := c.getNomadVersion("", nodeID)
|
||||
err := checkVersion(version, minimumVersionPprofConstraint)
|
||||
if err != nil {
|
||||
c.Ui.Warn(fmt.Sprintf("Skipping pprof: %v", err))
|
||||
}
|
||||
pprofNodeIDs = append(pprofNodeIDs, nodeID)
|
||||
}
|
||||
|
||||
// Take the first set of pprofs synchronously...
|
||||
c.Ui.Output(" Capture pprofInterval 0000")
|
||||
c.collectPprofs(client, 0)
|
||||
c.collectPprofs(client, pprofServerIDs, pprofNodeIDs, 0)
|
||||
if c.pprofInterval == c.pprofDuration {
|
||||
return
|
||||
}
|
||||
|
@ -921,7 +957,7 @@ func (c *OperatorDebugCommand) collectPeriodicPprofs(client *api.Client) {
|
|||
return
|
||||
case <-timer.C:
|
||||
c.Ui.Output(fmt.Sprintf(" Capture pprofInterval %04d", pprofIntervalCount))
|
||||
c.collectPprofs(client, pprofIntervalCount)
|
||||
c.collectPprofs(client, pprofServerIDs, pprofNodeIDs, pprofIntervalCount)
|
||||
timer.Reset(c.pprofInterval)
|
||||
pprofIntervalCount++
|
||||
}
|
||||
|
@ -930,12 +966,12 @@ func (c *OperatorDebugCommand) collectPeriodicPprofs(client *api.Client) {
|
|||
}
|
||||
|
||||
// collectPprofs captures the /agent/pprof for each listed node
|
||||
func (c *OperatorDebugCommand) collectPprofs(client *api.Client, interval int) {
|
||||
for _, n := range c.nodeIDs {
|
||||
func (c *OperatorDebugCommand) collectPprofs(client *api.Client, serverIDs, nodeIDs []string, interval int) {
|
||||
for _, n := range nodeIDs {
|
||||
c.collectPprof(clientDir, n, client, interval)
|
||||
}
|
||||
|
||||
for _, n := range c.serverIDs {
|
||||
for _, n := range serverIDs {
|
||||
c.collectPprof(serverDir, n, client, interval)
|
||||
}
|
||||
}
|
||||
|
@ -987,12 +1023,6 @@ func (c *OperatorDebugCommand) collectPprof(path, id string, client *api.Client,
|
|||
c.savePprofProfile(path, "heap", opts, client) // A sampling of memory allocations of live objects. You can specify the gc GET parameter to run GC before taking the heap sample.
|
||||
c.savePprofProfile(path, "allocs", opts, client) // A sampling of all past memory allocations
|
||||
c.savePprofProfile(path, "threadcreate", opts, client) // Stack traces that led to the creation of new OS threads
|
||||
|
||||
// This profile is disabled by default -- Requires runtime.SetBlockProfileRate to enable
|
||||
// c.savePprofProfile(path, "block", opts, client) // Stack traces that led to blocking on synchronization primitives
|
||||
|
||||
// This profile is disabled by default -- Requires runtime.SetMutexProfileFraction to enable
|
||||
// c.savePprofProfile(path, "mutex", opts, client) // Stack traces of holders of contended mutexes
|
||||
}
|
||||
|
||||
// savePprofProfile retrieves a pprof profile and writes to disk
|
||||
|
@ -1714,3 +1744,53 @@ func isRedirectError(err error) bool {
|
|||
const redirectErr string = `invalid character '<' looking for beginning of value`
|
||||
return strings.Contains(err.Error(), redirectErr)
|
||||
}
|
||||
|
||||
// getNomadVersion fetches the version of Nomad running on a given server/client node ID
|
||||
func (c *OperatorDebugCommand) getNomadVersion(serverID string, nodeID string) string {
|
||||
if serverID == "" && nodeID == "" {
|
||||
return ""
|
||||
}
|
||||
|
||||
version := ""
|
||||
if serverID != "" {
|
||||
for _, server := range c.members.Members {
|
||||
// Raft v2 server
|
||||
if server.Name == serverID {
|
||||
version = server.Tags["build"]
|
||||
}
|
||||
|
||||
// Raft v3 server
|
||||
if server.Tags["id"] == serverID {
|
||||
version = server.Tags["version"]
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if nodeID != "" {
|
||||
for _, node := range c.nodes {
|
||||
if node.ID == nodeID {
|
||||
version = node.Version
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return version
|
||||
}
|
||||
|
||||
// checkVersion verifies that version satisfies the constraint
|
||||
func checkVersion(version string, versionConstraint string) error {
|
||||
v, err := goversion.NewVersion(version)
|
||||
if err != nil {
|
||||
return fmt.Errorf("error: %v", err)
|
||||
}
|
||||
|
||||
c, err := goversion.NewConstraint(versionConstraint)
|
||||
if err != nil {
|
||||
return fmt.Errorf("error: %v", err)
|
||||
}
|
||||
|
||||
if !c.Check(v) {
|
||||
return nil
|
||||
}
|
||||
return fmt.Errorf("unsupported version=%s matches version filter %s", version, minimumVersionPprofConstraint)
|
||||
}
|
||||
|
|
|
@ -578,6 +578,36 @@ func TestDebug_Fail_Pprof(t *testing.T) {
|
|||
require.Contains(t, ui.OutputWriter.String(), "Created debug archive") // Archive should be generated anyway
|
||||
}
|
||||
|
||||
// TestDebug_PprofVersionCheck asserts that only versions < 0.12.0 are
|
||||
// filtered by the version constraint.
|
||||
func TestDebug_PprofVersionCheck(t *testing.T) {
|
||||
cases := []struct {
|
||||
version string
|
||||
errMsg string
|
||||
}{
|
||||
{"0.8.7", ""},
|
||||
{"0.11.1", "unsupported version=0.11.1 matches version filter >= 0.11.0, <= 0.11.2"},
|
||||
{"0.11.2", "unsupported version=0.11.2 matches version filter >= 0.11.0, <= 0.11.2"},
|
||||
{"0.11.2+ent", "unsupported version=0.11.2+ent matches version filter >= 0.11.0, <= 0.11.2"},
|
||||
{"0.11.3", ""},
|
||||
{"0.11.3+ent", ""},
|
||||
{"0.12.0", ""},
|
||||
{"1.3.0", ""},
|
||||
{"foo.bar", "error: Malformed version: foo.bar"},
|
||||
}
|
||||
|
||||
for _, tc := range cases {
|
||||
t.Run(tc.version, func(t *testing.T) {
|
||||
err := checkVersion(tc.version, minimumVersionPprofConstraint)
|
||||
if tc.errMsg == "" {
|
||||
require.NoError(t, err, "expected no error from %s", tc.version)
|
||||
} else {
|
||||
require.EqualError(t, err, tc.errMsg)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestDebug_StringToSlice(t *testing.T) {
|
||||
ci.Parallel(t)
|
||||
|
||||
|
|
Loading…
Reference in New Issue