debug: add version constraint to avoid pprof panic (#12807)

This commit is contained in:
Dave May 2022-04-28 13:18:55 -04:00 committed by GitHub
parent 0830e3c787
commit 97cf204c00
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 134 additions and 21 deletions

3
.changelog/12807.txt Normal file
View File

@ -0,0 +1,3 @@
```release-note:improvement
cli: `operator debug` command now skips generating pprofs to avoid a panic on Nomad 0.11.2. 0.11.1, and 0.11.0
```

View File

@ -23,6 +23,7 @@ import (
"github.com/hashicorp/go-cleanhttp"
"github.com/hashicorp/go-multierror"
goversion "github.com/hashicorp/go-version"
"github.com/hashicorp/nomad/api"
"github.com/hashicorp/nomad/api/contexts"
"github.com/hashicorp/nomad/helper"
@ -54,14 +55,17 @@ type OperatorDebugCommand struct {
cancel context.CancelFunc
opts *api.QueryOptions
verbose bool
members *api.ServerMembers
nodes []*api.NodeListStub
}
const (
userAgent = "nomad operator debug"
clusterDir = "cluster"
clientDir = "client"
serverDir = "server"
intervalDir = "interval"
userAgent = "nomad operator debug"
clusterDir = "cluster"
clientDir = "client"
serverDir = "server"
intervalDir = "interval"
minimumVersionPprofConstraint = ">= 0.11.0, <= 0.11.2"
)
func (c *OperatorDebugCommand) Help() string {
@ -502,6 +506,16 @@ func (c *OperatorDebugCommand) Run(args []string) int {
AuthToken: c.Meta.token,
}
// Get complete list of client nodes
c.nodes, _, err = client.Nodes().List(c.queryOpts())
if err != nil {
c.Ui.Error(fmt.Sprintf("Error querying node info: %v", err))
return 1
}
// Write nodes to file
c.writeJSON(clusterDir, "nodes.json", c.nodes, err)
// Search all nodes If a node class is specified without a list of node id prefixes
if c.nodeClass != "" && nodeIDs == "" {
nodeIDs = "all"
@ -565,17 +579,17 @@ func (c *OperatorDebugCommand) Run(args []string) int {
}
// Resolve servers
members, err := client.Agent().MembersOpts(c.queryOpts())
c.members, err = client.Agent().MembersOpts(c.queryOpts())
if err != nil {
c.Ui.Error(fmt.Sprintf("Failed to retrieve server list; err: %v", err))
return 1
}
// Write complete list of server members to file
c.writeJSON(clusterDir, "members.json", members, err)
c.writeJSON(clusterDir, "members.json", c.members, err)
// Filter for servers matching criteria
c.serverIDs, err = filterServerMembers(members, serverIDs, c.region)
c.serverIDs, err = filterServerMembers(c.members, serverIDs, c.region)
if err != nil {
c.Ui.Error(fmt.Sprintf("Failed to parse server list; err: %v", err))
return 1
@ -584,8 +598,8 @@ func (c *OperatorDebugCommand) Run(args []string) int {
serversFound := 0
serverCaptureCount := 0
if members != nil {
serversFound = len(members.Members)
if c.members != nil {
serversFound = len(c.members.Members)
}
if c.serverIDs != nil {
serverCaptureCount = len(c.serverIDs)
@ -900,9 +914,31 @@ func (c *OperatorDebugCommand) collectAgentHost(path, id string, client *api.Cli
func (c *OperatorDebugCommand) collectPeriodicPprofs(client *api.Client) {
pprofNodeIDs := []string{}
pprofServerIDs := []string{}
// threadcreate pprof causes a panic on Nomad 0.11.0 to 0.11.2 -- skip those versions
for _, serverID := range c.serverIDs {
version := c.getNomadVersion(serverID, "")
err := checkVersion(version, minimumVersionPprofConstraint)
if err != nil {
c.Ui.Warn(fmt.Sprintf("Skipping pprof: %v", err))
}
pprofServerIDs = append(pprofServerIDs, serverID)
}
for _, nodeID := range c.nodeIDs {
version := c.getNomadVersion("", nodeID)
err := checkVersion(version, minimumVersionPprofConstraint)
if err != nil {
c.Ui.Warn(fmt.Sprintf("Skipping pprof: %v", err))
}
pprofNodeIDs = append(pprofNodeIDs, nodeID)
}
// Take the first set of pprofs synchronously...
c.Ui.Output(" Capture pprofInterval 0000")
c.collectPprofs(client, 0)
c.collectPprofs(client, pprofServerIDs, pprofNodeIDs, 0)
if c.pprofInterval == c.pprofDuration {
return
}
@ -921,7 +957,7 @@ func (c *OperatorDebugCommand) collectPeriodicPprofs(client *api.Client) {
return
case <-timer.C:
c.Ui.Output(fmt.Sprintf(" Capture pprofInterval %04d", pprofIntervalCount))
c.collectPprofs(client, pprofIntervalCount)
c.collectPprofs(client, pprofServerIDs, pprofNodeIDs, pprofIntervalCount)
timer.Reset(c.pprofInterval)
pprofIntervalCount++
}
@ -930,12 +966,12 @@ func (c *OperatorDebugCommand) collectPeriodicPprofs(client *api.Client) {
}
// collectPprofs captures the /agent/pprof for each listed node
func (c *OperatorDebugCommand) collectPprofs(client *api.Client, interval int) {
for _, n := range c.nodeIDs {
func (c *OperatorDebugCommand) collectPprofs(client *api.Client, serverIDs, nodeIDs []string, interval int) {
for _, n := range nodeIDs {
c.collectPprof(clientDir, n, client, interval)
}
for _, n := range c.serverIDs {
for _, n := range serverIDs {
c.collectPprof(serverDir, n, client, interval)
}
}
@ -987,12 +1023,6 @@ func (c *OperatorDebugCommand) collectPprof(path, id string, client *api.Client,
c.savePprofProfile(path, "heap", opts, client) // A sampling of memory allocations of live objects. You can specify the gc GET parameter to run GC before taking the heap sample.
c.savePprofProfile(path, "allocs", opts, client) // A sampling of all past memory allocations
c.savePprofProfile(path, "threadcreate", opts, client) // Stack traces that led to the creation of new OS threads
// This profile is disabled by default -- Requires runtime.SetBlockProfileRate to enable
// c.savePprofProfile(path, "block", opts, client) // Stack traces that led to blocking on synchronization primitives
// This profile is disabled by default -- Requires runtime.SetMutexProfileFraction to enable
// c.savePprofProfile(path, "mutex", opts, client) // Stack traces of holders of contended mutexes
}
// savePprofProfile retrieves a pprof profile and writes to disk
@ -1714,3 +1744,53 @@ func isRedirectError(err error) bool {
const redirectErr string = `invalid character '<' looking for beginning of value`
return strings.Contains(err.Error(), redirectErr)
}
// getNomadVersion fetches the version of Nomad running on a given server/client node ID
func (c *OperatorDebugCommand) getNomadVersion(serverID string, nodeID string) string {
if serverID == "" && nodeID == "" {
return ""
}
version := ""
if serverID != "" {
for _, server := range c.members.Members {
// Raft v2 server
if server.Name == serverID {
version = server.Tags["build"]
}
// Raft v3 server
if server.Tags["id"] == serverID {
version = server.Tags["version"]
}
}
}
if nodeID != "" {
for _, node := range c.nodes {
if node.ID == nodeID {
version = node.Version
}
}
}
return version
}
// checkVersion verifies that version satisfies the constraint
func checkVersion(version string, versionConstraint string) error {
v, err := goversion.NewVersion(version)
if err != nil {
return fmt.Errorf("error: %v", err)
}
c, err := goversion.NewConstraint(versionConstraint)
if err != nil {
return fmt.Errorf("error: %v", err)
}
if !c.Check(v) {
return nil
}
return fmt.Errorf("unsupported version=%s matches version filter %s", version, minimumVersionPprofConstraint)
}

View File

@ -578,6 +578,36 @@ func TestDebug_Fail_Pprof(t *testing.T) {
require.Contains(t, ui.OutputWriter.String(), "Created debug archive") // Archive should be generated anyway
}
// TestDebug_PprofVersionCheck asserts that only versions < 0.12.0 are
// filtered by the version constraint.
func TestDebug_PprofVersionCheck(t *testing.T) {
cases := []struct {
version string
errMsg string
}{
{"0.8.7", ""},
{"0.11.1", "unsupported version=0.11.1 matches version filter >= 0.11.0, <= 0.11.2"},
{"0.11.2", "unsupported version=0.11.2 matches version filter >= 0.11.0, <= 0.11.2"},
{"0.11.2+ent", "unsupported version=0.11.2+ent matches version filter >= 0.11.0, <= 0.11.2"},
{"0.11.3", ""},
{"0.11.3+ent", ""},
{"0.12.0", ""},
{"1.3.0", ""},
{"foo.bar", "error: Malformed version: foo.bar"},
}
for _, tc := range cases {
t.Run(tc.version, func(t *testing.T) {
err := checkVersion(tc.version, minimumVersionPprofConstraint)
if tc.errMsg == "" {
require.NoError(t, err, "expected no error from %s", tc.version)
} else {
require.EqualError(t, err, tc.errMsg)
}
})
}
}
func TestDebug_StringToSlice(t *testing.T) {
ci.Parallel(t)