09b5e8d388
We introduced a `pprof-interval` argument to `operator debug` in #11938, and unfortunately this has resulted in a lot of test flakes. The actual command in use is mostly fine (although I've fixed some quirks here), so what's really happened is that the change has revealed some existing issues in the tests. Summary of changes: * Make first pprof collection synchronous to preserve the existing behavior for the common case where the pprof interval matches the duration. * Clamp `operator debug` pprof timing to that of the command. The `pprof-duration` should be no more than `duration` and the `pprof-interval` should be no more than `pprof-duration`. Clamp the values rather than throwing errors, which could change the commands that existing users might already have in debugging scripts * Testing: remove test parallelism The `operator debug` tests that stand up servers can't be run in parallel, because we don't have a way of canceling the API calls for pprof. The agent will still be running the last pprof when we exit, and that breaks the next test that talks to that same agent. (Because you can only run one pprof at a time on any process!) We could split off each subtest into its own server, but this test suite is already very slow. In future work we should fix this "for real" by making the API call cancelable. * Testing: assert against unexpected errors in `operator debug` tests. If we assert there are no unexpected error outputs, it's easier for the developer to debug when something is going wrong with the tests because the error output will be presented as a failing test, rather than just a failing exit code check. Or worse, no failing exit code check! This also forces us to be explicit about which tests will return 0 exit codes but still emit (presumably ignorable) error outputs. Additional minor bug fixes (mostly in tests) and test refactorings: * Fix text alignment on pprof Duration in `operator debug` output * Remove "done" channel from `operator debug` event stream test. The goroutine we're blocking for here already tells us it's done by sending a value, so block on that instead of an extraneous channel * Event stream test timer should start at current time, not zero * Remove noise from `operator debug` test log output. The `t.Logf` calls already are picked out from the rest of the test output by being prefixed with the filename. * Remove explicit pprof args so we use the defaults clamped from duration/interval
1717 lines
49 KiB
Go
1717 lines
49 KiB
Go
package command
|
|
|
|
import (
|
|
"archive/tar"
|
|
"compress/gzip"
|
|
"context"
|
|
"crypto/tls"
|
|
"encoding/json"
|
|
"errors"
|
|
"flag"
|
|
"fmt"
|
|
"html/template"
|
|
"io"
|
|
"io/ioutil"
|
|
"net/http"
|
|
"os"
|
|
"os/signal"
|
|
"path/filepath"
|
|
"strconv"
|
|
"strings"
|
|
"syscall"
|
|
"time"
|
|
|
|
"github.com/hashicorp/go-cleanhttp"
|
|
"github.com/hashicorp/go-multierror"
|
|
"github.com/hashicorp/nomad/api"
|
|
"github.com/hashicorp/nomad/api/contexts"
|
|
"github.com/hashicorp/nomad/helper"
|
|
"github.com/hashicorp/nomad/nomad/structs"
|
|
"github.com/hashicorp/nomad/version"
|
|
"github.com/posener/complete"
|
|
)
|
|
|
|
type OperatorDebugCommand struct {
|
|
Meta
|
|
|
|
timestamp string
|
|
collectDir string
|
|
duration time.Duration
|
|
interval time.Duration
|
|
pprofInterval time.Duration
|
|
pprofDuration time.Duration
|
|
logLevel string
|
|
maxNodes int
|
|
nodeClass string
|
|
nodeIDs []string
|
|
serverIDs []string
|
|
topics map[api.Topic][]string
|
|
index uint64
|
|
consul *external
|
|
vault *external
|
|
manifest []string
|
|
ctx context.Context
|
|
cancel context.CancelFunc
|
|
opts *api.QueryOptions
|
|
verbose bool
|
|
}
|
|
|
|
const (
|
|
userAgent = "nomad operator debug"
|
|
clusterDir = "cluster"
|
|
clientDir = "client"
|
|
serverDir = "server"
|
|
intervalDir = "interval"
|
|
)
|
|
|
|
func (c *OperatorDebugCommand) Help() string {
|
|
helpText := `
|
|
Usage: nomad operator debug [options]
|
|
|
|
Build an archive containing Nomad cluster configuration and state, and Consul
|
|
and Vault status. Include logs and pprof profiles for selected servers and
|
|
client nodes.
|
|
|
|
If ACLs are enabled, this command will require a token with the 'node:read'
|
|
capability to run. In order to collect information, the token will also
|
|
require the 'agent:read' and 'operator:read' capabilities, as well as the
|
|
'list-jobs' capability for all namespaces. To collect pprof profiles the
|
|
token will also require 'agent:write', or enable_debug configuration set to
|
|
true.
|
|
|
|
If event stream capture is enabled, the Job, Allocation, Deployment,
|
|
and Evaluation topics require 'namespace:read-job' capabilities, the Node
|
|
topic requires 'node:read'. A 'management' token is required to capture
|
|
ACLToken, ACLPolicy, or all all events.
|
|
|
|
General Options:
|
|
|
|
` + generalOptionsUsage(usageOptsDefault|usageOptsNoNamespace) + `
|
|
|
|
Consul Options:
|
|
|
|
-consul-http-addr=<addr>
|
|
The address and port of the Consul HTTP agent. Overrides the
|
|
CONSUL_HTTP_ADDR environment variable.
|
|
|
|
-consul-token=<token>
|
|
Token used to query Consul. Overrides the CONSUL_HTTP_TOKEN environment
|
|
variable and the Consul token file.
|
|
|
|
-consul-token-file=<path>
|
|
Path to the Consul token file. Overrides the CONSUL_HTTP_TOKEN_FILE
|
|
environment variable.
|
|
|
|
-consul-client-cert=<path>
|
|
Path to the Consul client cert file. Overrides the CONSUL_CLIENT_CERT
|
|
environment variable.
|
|
|
|
-consul-client-key=<path>
|
|
Path to the Consul client key file. Overrides the CONSUL_CLIENT_KEY
|
|
environment variable.
|
|
|
|
-consul-ca-cert=<path>
|
|
Path to a CA file to use with Consul. Overrides the CONSUL_CACERT
|
|
environment variable and the Consul CA path.
|
|
|
|
-consul-ca-path=<path>
|
|
Path to a directory of PEM encoded CA cert files to verify the Consul
|
|
certificate. Overrides the CONSUL_CAPATH environment variable.
|
|
|
|
Vault Options:
|
|
|
|
-vault-address=<addr>
|
|
The address and port of the Vault HTTP agent. Overrides the VAULT_ADDR
|
|
environment variable.
|
|
|
|
-vault-token=<token>
|
|
Token used to query Vault. Overrides the VAULT_TOKEN environment
|
|
variable.
|
|
|
|
-vault-client-cert=<path>
|
|
Path to the Vault client cert file. Overrides the VAULT_CLIENT_CERT
|
|
environment variable.
|
|
|
|
-vault-client-key=<path>
|
|
Path to the Vault client key file. Overrides the VAULT_CLIENT_KEY
|
|
environment variable.
|
|
|
|
-vault-ca-cert=<path>
|
|
Path to a CA file to use with Vault. Overrides the VAULT_CACERT
|
|
environment variable and the Vault CA path.
|
|
|
|
-vault-ca-path=<path>
|
|
Path to a directory of PEM encoded CA cert files to verify the Vault
|
|
certificate. Overrides the VAULT_CAPATH environment variable.
|
|
|
|
Debug Options:
|
|
|
|
-duration=<duration>
|
|
Set the duration of the debug capture. Logs will be captured from specified servers and
|
|
nodes at "log-level". Defaults to 2m.
|
|
|
|
-event-index=<index>
|
|
Specifies the index to start streaming events from. If the requested index is
|
|
no longer in the buffer the stream will start at the next available index.
|
|
Defaults to 0.
|
|
|
|
-event-topic=<Allocation,Evaluation,Job,Node,*>:<filter>
|
|
Enable event stream capture, filtered by comma delimited list of topic filters.
|
|
Examples:
|
|
"all" or "*:*" for all events
|
|
"Evaluation" or "Evaluation:*" for all evaluation events
|
|
"*:example" for all events related to the job "example"
|
|
Defaults to "none" (disabled).
|
|
|
|
-interval=<interval>
|
|
The interval between snapshots of the Nomad state. Set interval equal to
|
|
duration to capture a single snapshot. Defaults to 30s.
|
|
|
|
-log-level=<level>
|
|
The log level to monitor. Defaults to DEBUG.
|
|
|
|
-max-nodes=<count>
|
|
Cap the maximum number of client nodes included in the capture. Defaults
|
|
to 10, set to 0 for unlimited.
|
|
|
|
-node-id=<node1>,<node2>
|
|
Comma separated list of Nomad client node ids to monitor for logs, API
|
|
outputs, and pprof profiles. Accepts id prefixes, and "all" to select all
|
|
nodes (up to count = max-nodes). Defaults to "all".
|
|
|
|
-node-class=<node-class>
|
|
Filter client nodes based on node class.
|
|
|
|
-pprof-duration=<duration>
|
|
Duration for pprof collection. Defaults to 1s or -duration, whichever is less.
|
|
|
|
-pprof-interval=<pprof-interval>
|
|
The interval between pprof collections. Set interval equal to
|
|
duration to capture a single snapshot. Defaults to 250ms or
|
|
-pprof-duration, whichever is less.
|
|
|
|
-server-id=<server1>,<server2>
|
|
Comma separated list of Nomad server names to monitor for logs, API
|
|
outputs, and pprof profiles. Accepts server names, "leader", or "all".
|
|
Defaults to "all".
|
|
|
|
-stale=<true|false>
|
|
If "false", the default, get membership data from the cluster leader. If
|
|
the cluster is in an outage unable to establish leadership, it may be
|
|
necessary to get the configuration from a non-leader server.
|
|
|
|
-output=<path>
|
|
Path to the parent directory of the output directory. If specified, no
|
|
archive is built. Defaults to the current directory.
|
|
|
|
-verbose
|
|
Enable verbose output.
|
|
`
|
|
return strings.TrimSpace(helpText)
|
|
}
|
|
|
|
func (c *OperatorDebugCommand) Synopsis() string {
|
|
return "Build a debug archive"
|
|
}
|
|
|
|
func (c *OperatorDebugCommand) AutocompleteFlags() complete.Flags {
|
|
return mergeAutocompleteFlags(c.Meta.AutocompleteFlags(FlagSetClient),
|
|
complete.Flags{
|
|
"-duration": complete.PredictAnything,
|
|
"-event-index": complete.PredictAnything,
|
|
"-event-topic": complete.PredictAnything,
|
|
"-interval": complete.PredictAnything,
|
|
"-log-level": complete.PredictSet("TRACE", "DEBUG", "INFO", "WARN", "ERROR"),
|
|
"-max-nodes": complete.PredictAnything,
|
|
"-node-class": NodeClassPredictor(c.Client),
|
|
"-node-id": NodePredictor(c.Client),
|
|
"-server-id": ServerPredictor(c.Client),
|
|
"-output": complete.PredictDirs("*"),
|
|
"-pprof-duration": complete.PredictAnything,
|
|
"-consul-token": complete.PredictAnything,
|
|
"-vault-token": complete.PredictAnything,
|
|
"-verbose": complete.PredictAnything,
|
|
})
|
|
}
|
|
|
|
func (c *OperatorDebugCommand) AutocompleteArgs() complete.Predictor {
|
|
return complete.PredictNothing
|
|
}
|
|
|
|
// NodePredictor returns a client node predictor
|
|
func NodePredictor(factory ApiClientFactory) complete.Predictor {
|
|
return complete.PredictFunc(func(a complete.Args) []string {
|
|
client, err := factory()
|
|
if err != nil {
|
|
return nil
|
|
}
|
|
|
|
// note we can't use the -stale flag here because we're in the
|
|
// predictor, but a stale query should be safe for prediction;
|
|
// we also can't use region forwarding because we can't rely
|
|
// on the server being up
|
|
resp, _, err := client.Search().PrefixSearch(
|
|
a.Last, contexts.Nodes, &api.QueryOptions{AllowStale: true})
|
|
if err != nil {
|
|
return []string{}
|
|
}
|
|
return resp.Matches[contexts.Nodes]
|
|
})
|
|
}
|
|
|
|
// NodeClassPredictor returns a client node class predictor
|
|
// TODO dmay: Consider API options for node class filtering
|
|
func NodeClassPredictor(factory ApiClientFactory) complete.Predictor {
|
|
return complete.PredictFunc(func(a complete.Args) []string {
|
|
client, err := factory()
|
|
if err != nil {
|
|
return nil
|
|
}
|
|
|
|
// note we can't use the -stale flag here because we're in the
|
|
// predictor, but a stale query should be safe for prediction;
|
|
// we also can't use region forwarding because we can't rely
|
|
// on the server being up
|
|
nodes, _, err := client.Nodes().List(&api.QueryOptions{AllowStale: true})
|
|
if err != nil {
|
|
return []string{}
|
|
}
|
|
|
|
// Build map of unique node classes across all nodes
|
|
classes := make(map[string]bool)
|
|
for _, node := range nodes {
|
|
classes[node.NodeClass] = true
|
|
}
|
|
|
|
// Iterate over node classes looking for match
|
|
filtered := []string{}
|
|
for class := range classes {
|
|
if strings.HasPrefix(class, a.Last) {
|
|
filtered = append(filtered, class)
|
|
}
|
|
}
|
|
|
|
return filtered
|
|
})
|
|
}
|
|
|
|
// ServerPredictor returns a server member predictor
|
|
// TODO dmay: Consider API options for server member filtering
|
|
func ServerPredictor(factory ApiClientFactory) complete.Predictor {
|
|
return complete.PredictFunc(func(a complete.Args) []string {
|
|
client, err := factory()
|
|
if err != nil {
|
|
return nil
|
|
}
|
|
|
|
// note we can't use the -stale flag here because we're in the
|
|
// predictor, but a stale query should be safe for prediction;
|
|
// we also can't use region forwarding because we can't rely
|
|
// on the server being up
|
|
members, err := client.Agent().MembersOpts(&api.QueryOptions{AllowStale: true})
|
|
if err != nil {
|
|
return []string{}
|
|
}
|
|
|
|
// Iterate over server members looking for match
|
|
filtered := []string{}
|
|
for _, member := range members.Members {
|
|
if strings.HasPrefix(member.Name, a.Last) {
|
|
filtered = append(filtered, member.Name)
|
|
}
|
|
}
|
|
|
|
return filtered
|
|
})
|
|
}
|
|
|
|
// queryOpts returns a copy of the shared api.QueryOptions so
|
|
// that api package methods can safely modify the options
|
|
func (c *OperatorDebugCommand) queryOpts() *api.QueryOptions {
|
|
qo := new(api.QueryOptions)
|
|
*qo = *c.opts
|
|
qo.Params = helper.CopyMapStringString(c.opts.Params)
|
|
return qo
|
|
}
|
|
|
|
func (c *OperatorDebugCommand) Name() string { return "debug" }
|
|
|
|
func (c *OperatorDebugCommand) Run(args []string) int {
|
|
flags := c.Meta.FlagSet(c.Name(), FlagSetClient)
|
|
flags.Usage = func() { c.Ui.Output(c.Help()) }
|
|
|
|
var duration, interval, pprofInterval, output, pprofDuration, eventTopic string
|
|
var eventIndex int64
|
|
var nodeIDs, serverIDs string
|
|
var allowStale bool
|
|
|
|
flags.StringVar(&duration, "duration", "2m", "")
|
|
flags.Int64Var(&eventIndex, "event-index", 0, "")
|
|
flags.StringVar(&eventTopic, "event-topic", "none", "")
|
|
flags.StringVar(&interval, "interval", "30s", "")
|
|
flags.StringVar(&c.logLevel, "log-level", "DEBUG", "")
|
|
flags.IntVar(&c.maxNodes, "max-nodes", 10, "")
|
|
flags.StringVar(&c.nodeClass, "node-class", "", "")
|
|
flags.StringVar(&nodeIDs, "node-id", "all", "")
|
|
flags.StringVar(&serverIDs, "server-id", "all", "")
|
|
flags.BoolVar(&allowStale, "stale", false, "")
|
|
flags.StringVar(&output, "output", "", "")
|
|
flags.StringVar(&pprofDuration, "pprof-duration", "1s", "")
|
|
flags.StringVar(&pprofInterval, "pprof-interval", "250ms", "")
|
|
flags.BoolVar(&c.verbose, "verbose", false, "")
|
|
|
|
c.consul = &external{tls: &api.TLSConfig{}}
|
|
flags.StringVar(&c.consul.addrVal, "consul-http-addr", os.Getenv("CONSUL_HTTP_ADDR"), "")
|
|
ssl := os.Getenv("CONSUL_HTTP_SSL")
|
|
c.consul.ssl, _ = strconv.ParseBool(ssl)
|
|
flags.StringVar(&c.consul.auth, "consul-auth", os.Getenv("CONSUL_HTTP_AUTH"), "")
|
|
flags.StringVar(&c.consul.tokenVal, "consul-token", os.Getenv("CONSUL_HTTP_TOKEN"), "")
|
|
flags.StringVar(&c.consul.tokenFile, "consul-token-file", os.Getenv("CONSUL_HTTP_TOKEN_FILE"), "")
|
|
flags.StringVar(&c.consul.tls.ClientCert, "consul-client-cert", os.Getenv("CONSUL_CLIENT_CERT"), "")
|
|
flags.StringVar(&c.consul.tls.ClientKey, "consul-client-key", os.Getenv("CONSUL_CLIENT_KEY"), "")
|
|
flags.StringVar(&c.consul.tls.CACert, "consul-ca-cert", os.Getenv("CONSUL_CACERT"), "")
|
|
flags.StringVar(&c.consul.tls.CAPath, "consul-ca-path", os.Getenv("CONSUL_CAPATH"), "")
|
|
|
|
c.vault = &external{tls: &api.TLSConfig{}}
|
|
flags.StringVar(&c.vault.addrVal, "vault-address", os.Getenv("VAULT_ADDR"), "")
|
|
flags.StringVar(&c.vault.tokenVal, "vault-token", os.Getenv("VAULT_TOKEN"), "")
|
|
flags.StringVar(&c.vault.tls.CACert, "vault-ca-cert", os.Getenv("VAULT_CACERT"), "")
|
|
flags.StringVar(&c.vault.tls.CAPath, "vault-ca-path", os.Getenv("VAULT_CAPATH"), "")
|
|
flags.StringVar(&c.vault.tls.ClientCert, "vault-client-cert", os.Getenv("VAULT_CLIENT_CERT"), "")
|
|
flags.StringVar(&c.vault.tls.ClientKey, "vault-client-key", os.Getenv("VAULT_CLIENT_KEY"), "")
|
|
|
|
if err := flags.Parse(args); err != nil {
|
|
c.Ui.Error(fmt.Sprintf("Error parsing arguments: %q", err))
|
|
return 1
|
|
}
|
|
|
|
// Parse the capture duration
|
|
d, err := time.ParseDuration(duration)
|
|
if err != nil {
|
|
c.Ui.Error(fmt.Sprintf("Error parsing duration: %s: %s", duration, err.Error()))
|
|
return 1
|
|
}
|
|
c.duration = d
|
|
|
|
// Parse the capture interval
|
|
i, err := time.ParseDuration(interval)
|
|
if err != nil {
|
|
c.Ui.Error(fmt.Sprintf("Error parsing interval: %s: %s", interval, err.Error()))
|
|
return 1
|
|
}
|
|
c.interval = i
|
|
|
|
// Validate interval
|
|
if i.Seconds() > d.Seconds() {
|
|
c.Ui.Error(fmt.Sprintf("Error parsing interval: %s is greater than duration %s", interval, duration))
|
|
return 1
|
|
}
|
|
|
|
// Parse and clamp the pprof capture duration
|
|
pd, err := time.ParseDuration(pprofDuration)
|
|
if err != nil {
|
|
c.Ui.Error(fmt.Sprintf("Error parsing pprof duration: %s: %s", pprofDuration, err.Error()))
|
|
return 1
|
|
}
|
|
if pd.Seconds() > d.Seconds() {
|
|
pd = d
|
|
}
|
|
c.pprofDuration = pd
|
|
|
|
// Parse and clamp the pprof capture interval
|
|
pi, err := time.ParseDuration(pprofInterval)
|
|
if err != nil {
|
|
c.Ui.Error(fmt.Sprintf("Error parsing pprof-interval: %s: %s", pprofInterval, err.Error()))
|
|
return 1
|
|
}
|
|
if pi.Seconds() > pd.Seconds() {
|
|
pi = pd
|
|
}
|
|
c.pprofInterval = pi
|
|
|
|
// Parse event stream topic filter
|
|
t, err := topicsFromString(eventTopic)
|
|
if err != nil {
|
|
c.Ui.Error(fmt.Sprintf("Error parsing event topics: %v", err))
|
|
return 1
|
|
}
|
|
c.topics = t
|
|
|
|
// Validate and set initial event stream index
|
|
if eventIndex < 0 {
|
|
c.Ui.Error("Event stream index must be greater than zero")
|
|
return 1
|
|
}
|
|
c.index = uint64(eventIndex)
|
|
|
|
// Verify there are no extra arguments
|
|
args = flags.Args()
|
|
if l := len(args); l != 0 {
|
|
c.Ui.Error("This command takes no arguments")
|
|
c.Ui.Error(commandErrorText(c))
|
|
return 1
|
|
}
|
|
|
|
// Initialize capture variables and structs
|
|
c.manifest = make([]string, 0)
|
|
ctx, cancel := context.WithCancel(context.Background())
|
|
c.ctx = ctx
|
|
c.cancel = cancel
|
|
c.trap()
|
|
|
|
// Generate timestamped file name
|
|
format := "2006-01-02-150405Z"
|
|
c.timestamp = time.Now().UTC().Format(format)
|
|
stamped := "nomad-debug-" + c.timestamp
|
|
|
|
// Create the output directory
|
|
var tmp string
|
|
if output != "" {
|
|
// User specified output directory
|
|
tmp = filepath.Join(output, stamped)
|
|
_, err := os.Stat(tmp)
|
|
if !os.IsNotExist(err) {
|
|
c.Ui.Error("Output directory already exists")
|
|
return 2
|
|
}
|
|
} else {
|
|
// Generate temp directory
|
|
tmp, err = ioutil.TempDir(os.TempDir(), stamped)
|
|
if err != nil {
|
|
c.Ui.Error(fmt.Sprintf("Error creating tmp directory: %s", err.Error()))
|
|
return 2
|
|
}
|
|
defer os.RemoveAll(tmp)
|
|
}
|
|
|
|
c.collectDir = tmp
|
|
|
|
// Write CLI flags to JSON file
|
|
c.writeFlags(flags)
|
|
|
|
// Create an instance of the API client
|
|
client, err := c.Meta.Client()
|
|
if err != nil {
|
|
c.Ui.Error(fmt.Sprintf("Error initializing client: %s", err.Error()))
|
|
return 1
|
|
}
|
|
|
|
c.opts = &api.QueryOptions{
|
|
Region: c.Meta.region,
|
|
AllowStale: allowStale,
|
|
AuthToken: c.Meta.token,
|
|
}
|
|
|
|
// Search all nodes If a node class is specified without a list of node id prefixes
|
|
if c.nodeClass != "" && nodeIDs == "" {
|
|
nodeIDs = "all"
|
|
}
|
|
|
|
// Resolve client node id prefixes
|
|
nodesFound := 0
|
|
nodeLookupFailCount := 0
|
|
nodeCaptureCount := 0
|
|
|
|
for _, id := range stringToSlice(nodeIDs) {
|
|
if id == "all" {
|
|
// Capture from all nodes using empty prefix filter
|
|
id = ""
|
|
} else {
|
|
// Capture from nodes starting with prefix id
|
|
id = sanitizeUUIDPrefix(id)
|
|
}
|
|
nodes, _, err := client.Nodes().PrefixListOpts(id, c.queryOpts())
|
|
if err != nil {
|
|
c.Ui.Error(fmt.Sprintf("Error querying node info: %s", err))
|
|
return 1
|
|
}
|
|
|
|
// Increment fail count if no nodes are found
|
|
if len(nodes) == 0 {
|
|
c.Ui.Error(fmt.Sprintf("No node(s) with prefix %q found", id))
|
|
nodeLookupFailCount++
|
|
continue
|
|
}
|
|
|
|
nodesFound += len(nodes)
|
|
|
|
// Apply constraints to nodes found
|
|
for _, n := range nodes {
|
|
// Ignore nodes that do not match specified class
|
|
if c.nodeClass != "" && n.NodeClass != c.nodeClass {
|
|
continue
|
|
}
|
|
|
|
// Add node to capture list
|
|
c.nodeIDs = append(c.nodeIDs, n.ID)
|
|
nodeCaptureCount++
|
|
|
|
// Stop looping when we reach the max
|
|
if c.maxNodes != 0 && nodeCaptureCount >= c.maxNodes {
|
|
break
|
|
}
|
|
}
|
|
}
|
|
|
|
// Return error if nodes were specified but none were found
|
|
if len(nodeIDs) > 0 && nodeCaptureCount == 0 {
|
|
if nodeIDs == "all" {
|
|
// It's okay to have zero clients for default "all"
|
|
c.Ui.Info("Note: \"-node-id=all\" specified but no clients found")
|
|
} else {
|
|
c.Ui.Error(fmt.Sprintf("Failed to retrieve clients, 0 nodes found in list: %s", nodeIDs))
|
|
return 1
|
|
}
|
|
}
|
|
|
|
// Resolve servers
|
|
members, err := client.Agent().MembersOpts(c.queryOpts())
|
|
if err != nil {
|
|
c.Ui.Error(fmt.Sprintf("Failed to retrieve server list; err: %v", err))
|
|
return 1
|
|
}
|
|
|
|
// Write complete list of server members to file
|
|
c.writeJSON(clusterDir, "members.json", members, err)
|
|
|
|
// Filter for servers matching criteria
|
|
c.serverIDs, err = filterServerMembers(members, serverIDs, c.region)
|
|
if err != nil {
|
|
c.Ui.Error(fmt.Sprintf("Failed to parse server list; err: %v", err))
|
|
return 1
|
|
}
|
|
|
|
serversFound := 0
|
|
serverCaptureCount := 0
|
|
|
|
if members != nil {
|
|
serversFound = len(members.Members)
|
|
}
|
|
if c.serverIDs != nil {
|
|
serverCaptureCount = len(c.serverIDs)
|
|
}
|
|
|
|
// Return error if servers were specified but not found
|
|
if len(serverIDs) > 0 && serverCaptureCount == 0 {
|
|
c.Ui.Error(fmt.Sprintf("Failed to retrieve servers, 0 members found in list: %s", serverIDs))
|
|
return 1
|
|
}
|
|
|
|
// Display general info about the capture
|
|
c.Ui.Output("Starting debugger...")
|
|
c.Ui.Output("")
|
|
c.Ui.Output(fmt.Sprintf("Nomad CLI Version: %s", version.GetVersion().FullVersionNumber(true)))
|
|
c.Ui.Output(fmt.Sprintf(" Region: %s", c.region))
|
|
c.Ui.Output(fmt.Sprintf(" Namespace: %s", c.namespace))
|
|
c.Ui.Output(fmt.Sprintf(" Servers: (%d/%d) %v", serverCaptureCount, serversFound, c.serverIDs))
|
|
c.Ui.Output(fmt.Sprintf(" Clients: (%d/%d) %v", nodeCaptureCount, nodesFound, c.nodeIDs))
|
|
if nodeCaptureCount > 0 && nodeCaptureCount == c.maxNodes {
|
|
c.Ui.Output(fmt.Sprintf(" Max node count reached (%d)", c.maxNodes))
|
|
}
|
|
if nodeLookupFailCount > 0 {
|
|
c.Ui.Output(fmt.Sprintf("Client fail count: %v", nodeLookupFailCount))
|
|
}
|
|
if c.nodeClass != "" {
|
|
c.Ui.Output(fmt.Sprintf(" Node Class: %s", c.nodeClass))
|
|
}
|
|
c.Ui.Output(fmt.Sprintf(" Interval: %s", interval))
|
|
c.Ui.Output(fmt.Sprintf(" Duration: %s", duration))
|
|
c.Ui.Output(fmt.Sprintf(" pprof Interval: %s", pprofInterval))
|
|
if c.pprofDuration.Seconds() != 1 {
|
|
c.Ui.Output(fmt.Sprintf(" pprof Duration: %s", c.pprofDuration))
|
|
}
|
|
if c.topics != nil {
|
|
c.Ui.Output(fmt.Sprintf(" Event topics: %+v", c.topics))
|
|
}
|
|
c.Ui.Output("")
|
|
c.Ui.Output("Capturing cluster data...")
|
|
|
|
// Start collecting data
|
|
err = c.collect(client)
|
|
if err != nil {
|
|
c.Ui.Error(fmt.Sprintf("Error collecting data: %s", err.Error()))
|
|
return 2
|
|
}
|
|
|
|
// Write index json/html manifest files
|
|
c.writeManifest()
|
|
|
|
// Exit before archive if output directory was specified
|
|
if output != "" {
|
|
c.Ui.Output(fmt.Sprintf("Created debug directory: %s", c.collectDir))
|
|
return 0
|
|
}
|
|
|
|
// Create archive tarball
|
|
archiveFile := stamped + ".tar.gz"
|
|
err = TarCZF(archiveFile, tmp, stamped)
|
|
if err != nil {
|
|
c.Ui.Error(fmt.Sprintf("Error creating archive: %s", err.Error()))
|
|
return 2
|
|
}
|
|
|
|
// Final output with name of tarball
|
|
c.Ui.Output(fmt.Sprintf("Created debug archive: %s", archiveFile))
|
|
return 0
|
|
}
|
|
|
|
// collect collects data from our endpoints and writes the archive bundle
|
|
func (c *OperatorDebugCommand) collect(client *api.Client) error {
|
|
// Start background captures
|
|
c.startMonitors(client)
|
|
c.startEventStream(client)
|
|
|
|
// Collect cluster data
|
|
self, err := client.Agent().Self()
|
|
c.writeJSON(clusterDir, "agent-self.json", self, err)
|
|
|
|
namespaces, _, err := client.Namespaces().List(c.queryOpts())
|
|
c.writeJSON(clusterDir, "namespaces.json", namespaces, err)
|
|
|
|
regions, err := client.Regions().List()
|
|
c.writeJSON(clusterDir, "regions.json", regions, err)
|
|
|
|
// Collect data from Consul
|
|
if c.consul.addrVal == "" {
|
|
c.getConsulAddrFromSelf(self)
|
|
}
|
|
c.collectConsul(clusterDir)
|
|
|
|
// Collect data from Vault
|
|
vaultAddr := c.vault.addrVal
|
|
if vaultAddr == "" {
|
|
vaultAddr = c.getVaultAddrFromSelf(self)
|
|
}
|
|
c.collectVault(clusterDir, vaultAddr)
|
|
|
|
c.collectAgentHosts(client)
|
|
c.collectPeriodicPprofs(client)
|
|
|
|
c.collectPeriodic(client)
|
|
|
|
return nil
|
|
}
|
|
|
|
// path returns platform specific paths in the tmp root directory
|
|
func (c *OperatorDebugCommand) path(paths ...string) string {
|
|
ps := []string{c.collectDir}
|
|
ps = append(ps, paths...)
|
|
return filepath.Join(ps...)
|
|
}
|
|
|
|
// mkdir creates directories in the tmp root directory
|
|
func (c *OperatorDebugCommand) mkdir(paths ...string) error {
|
|
joinedPath := c.path(paths...)
|
|
|
|
// Ensure path doesn't escape the sandbox of the capture directory
|
|
escapes := helper.PathEscapesSandbox(c.collectDir, joinedPath)
|
|
if escapes {
|
|
return fmt.Errorf("file path escapes capture directory")
|
|
}
|
|
|
|
return os.MkdirAll(joinedPath, 0755)
|
|
}
|
|
|
|
// startMonitors starts go routines for each node and client
|
|
func (c *OperatorDebugCommand) startMonitors(client *api.Client) {
|
|
for _, id := range c.nodeIDs {
|
|
go c.startMonitor(clientDir, "node_id", id, client)
|
|
}
|
|
|
|
for _, id := range c.serverIDs {
|
|
go c.startMonitor(serverDir, "server_id", id, client)
|
|
}
|
|
}
|
|
|
|
// startMonitor starts one monitor api request, writing to a file. It blocks and should be
|
|
// called in a go routine. Errors are ignored, we want to build the archive even if a node
|
|
// is unavailable
|
|
func (c *OperatorDebugCommand) startMonitor(path, idKey, nodeID string, client *api.Client) {
|
|
c.mkdir(path, nodeID)
|
|
fh, err := os.Create(c.path(path, nodeID, "monitor.log"))
|
|
if err != nil {
|
|
return
|
|
}
|
|
defer fh.Close()
|
|
|
|
qo := api.QueryOptions{
|
|
Params: map[string]string{
|
|
idKey: nodeID,
|
|
"log_level": c.logLevel,
|
|
},
|
|
AllowStale: c.queryOpts().AllowStale,
|
|
}
|
|
|
|
outCh, errCh := client.Agent().Monitor(c.ctx.Done(), &qo)
|
|
for {
|
|
select {
|
|
case out := <-outCh:
|
|
if out == nil {
|
|
continue
|
|
}
|
|
fh.Write(out.Data)
|
|
|
|
case err := <-errCh:
|
|
fh.WriteString(fmt.Sprintf("monitor: %s\n", err.Error()))
|
|
return
|
|
|
|
case <-c.ctx.Done():
|
|
return
|
|
}
|
|
}
|
|
}
|
|
|
|
// captureEventStream wraps the event stream capture process.
|
|
func (c *OperatorDebugCommand) startEventStream(client *api.Client) {
|
|
c.verboseOut("Launching eventstream goroutine...")
|
|
|
|
go func() {
|
|
if err := c.captureEventStream(client); err != nil {
|
|
var es string
|
|
if mErr, ok := err.(*multierror.Error); ok {
|
|
es = multierror.ListFormatFunc(mErr.Errors)
|
|
} else {
|
|
es = err.Error()
|
|
}
|
|
|
|
c.Ui.Error(fmt.Sprintf("Error capturing event stream: %s", es))
|
|
}
|
|
}()
|
|
}
|
|
|
|
func (c *OperatorDebugCommand) captureEventStream(client *api.Client) error {
|
|
// Ensure output directory is present
|
|
path := clusterDir
|
|
if err := c.mkdir(c.path(path)); err != nil {
|
|
return err
|
|
}
|
|
|
|
// Create the output file
|
|
fh, err := os.Create(c.path(path, "eventstream.json"))
|
|
if err != nil {
|
|
return err
|
|
}
|
|
defer fh.Close()
|
|
|
|
// Get handle to events endpoint
|
|
events := client.EventStream()
|
|
|
|
// Start streaming events
|
|
eventCh, err := events.Stream(c.ctx, c.topics, c.index, c.queryOpts())
|
|
if err != nil {
|
|
if errors.Is(err, context.Canceled) {
|
|
c.verboseOut("Event stream canceled: No events captured")
|
|
return nil
|
|
}
|
|
return fmt.Errorf("failed to stream events: %w", err)
|
|
}
|
|
|
|
eventCount := 0
|
|
errCount := 0
|
|
heartbeatCount := 0
|
|
channelEventCount := 0
|
|
|
|
var mErrs *multierror.Error
|
|
|
|
for {
|
|
select {
|
|
case event := <-eventCh:
|
|
channelEventCount++
|
|
if event.Err != nil {
|
|
errCount++
|
|
c.verboseOutf("error from event stream: index; %d err: %v", event.Index, event.Err)
|
|
mErrs = multierror.Append(mErrs, fmt.Errorf("error at index: %d, Err: %w", event.Index, event.Err))
|
|
break
|
|
}
|
|
|
|
if event.IsHeartbeat() {
|
|
heartbeatCount++
|
|
continue
|
|
}
|
|
|
|
for _, e := range event.Events {
|
|
eventCount++
|
|
c.verboseOutf("Event: %4d, Index: %d, Topic: %-10s, Type: %s, FilterKeys: %s", eventCount, e.Index, e.Topic, e.Type, e.FilterKeys)
|
|
|
|
bytes, err := json.Marshal(e)
|
|
if err != nil {
|
|
errCount++
|
|
mErrs = multierror.Append(mErrs, fmt.Errorf("failed to marshal json from Topic: %s, Type: %s, Err: %w", e.Topic, e.Type, err))
|
|
}
|
|
|
|
n, err := fh.Write(bytes)
|
|
if err != nil {
|
|
errCount++
|
|
mErrs = multierror.Append(mErrs, fmt.Errorf("failed to write bytes to eventstream.json; bytes written: %d, Err: %w", n, err))
|
|
break
|
|
}
|
|
n, err = fh.WriteString("\n")
|
|
if err != nil {
|
|
errCount++
|
|
mErrs = multierror.Append(mErrs, fmt.Errorf("failed to write string to eventstream.json; chars written: %d, Err: %w", n, err))
|
|
}
|
|
}
|
|
case <-c.ctx.Done():
|
|
c.verboseOutf("Event stream captured %d events, %d frames, %d heartbeats, %d errors", eventCount, channelEventCount, heartbeatCount, errCount)
|
|
return mErrs.ErrorOrNil()
|
|
}
|
|
}
|
|
}
|
|
|
|
// collectAgentHosts calls collectAgentHost for each selected node
|
|
func (c *OperatorDebugCommand) collectAgentHosts(client *api.Client) {
|
|
for _, n := range c.nodeIDs {
|
|
c.collectAgentHost(clientDir, n, client)
|
|
}
|
|
|
|
for _, n := range c.serverIDs {
|
|
c.collectAgentHost(serverDir, n, client)
|
|
}
|
|
}
|
|
|
|
// collectAgentHost gets the agent host data
|
|
func (c *OperatorDebugCommand) collectAgentHost(path, id string, client *api.Client) {
|
|
var host *api.HostDataResponse
|
|
var err error
|
|
if path == serverDir {
|
|
host, err = client.Agent().Host(id, "", c.queryOpts())
|
|
} else {
|
|
host, err = client.Agent().Host("", id, c.queryOpts())
|
|
}
|
|
|
|
if isRedirectError(err) {
|
|
c.Ui.Warn(fmt.Sprintf("%s/%s: /v1/agent/host unavailable on this agent", path, id))
|
|
return
|
|
}
|
|
|
|
if err != nil {
|
|
c.Ui.Error(fmt.Sprintf("%s/%s: Failed to retrieve agent host data, err: %v", path, id, err))
|
|
|
|
if strings.Contains(err.Error(), structs.ErrPermissionDenied.Error()) {
|
|
// Drop a hint to help the operator resolve the error
|
|
c.Ui.Warn("Agent host retrieval requires agent:read ACL or enable_debug=true. See https://www.nomadproject.io/api-docs/agent#host for more information.")
|
|
}
|
|
return // exit on any error
|
|
}
|
|
|
|
path = filepath.Join(path, id)
|
|
c.writeJSON(path, "agent-host.json", host, err)
|
|
}
|
|
|
|
func (c *OperatorDebugCommand) collectPeriodicPprofs(client *api.Client) {
|
|
|
|
// Take the first set of pprofs synchronously...
|
|
c.Ui.Output(" Capture pprofInterval 0000")
|
|
c.collectPprofs(client, 0)
|
|
if c.pprofInterval == c.pprofDuration {
|
|
return
|
|
}
|
|
|
|
// ... and then move the rest off into a goroutine
|
|
go func() {
|
|
ctx, cancel := context.WithTimeout(c.ctx, c.duration)
|
|
defer cancel()
|
|
timer, stop := helper.NewSafeTimer(c.pprofInterval)
|
|
defer stop()
|
|
|
|
pprofIntervalCount := 1
|
|
for {
|
|
select {
|
|
case <-ctx.Done():
|
|
return
|
|
case <-timer.C:
|
|
c.Ui.Output(fmt.Sprintf(" Capture pprofInterval %04d", pprofIntervalCount))
|
|
c.collectPprofs(client, pprofIntervalCount)
|
|
timer.Reset(c.pprofInterval)
|
|
pprofIntervalCount++
|
|
}
|
|
}
|
|
}()
|
|
}
|
|
|
|
// collectPprofs captures the /agent/pprof for each listed node
|
|
func (c *OperatorDebugCommand) collectPprofs(client *api.Client, interval int) {
|
|
for _, n := range c.nodeIDs {
|
|
c.collectPprof(clientDir, n, client, interval)
|
|
}
|
|
|
|
for _, n := range c.serverIDs {
|
|
c.collectPprof(serverDir, n, client, interval)
|
|
}
|
|
}
|
|
|
|
// collectPprof captures pprof data for the node
|
|
func (c *OperatorDebugCommand) collectPprof(path, id string, client *api.Client, interval int) {
|
|
pprofDurationSeconds := int(c.pprofDuration.Seconds())
|
|
opts := api.PprofOptions{Seconds: pprofDurationSeconds}
|
|
if path == serverDir {
|
|
opts.ServerID = id
|
|
} else {
|
|
opts.NodeID = id
|
|
}
|
|
|
|
path = filepath.Join(path, id)
|
|
filename := fmt.Sprintf("profile_%04d.prof", interval)
|
|
|
|
bs, err := client.Agent().CPUProfile(opts, c.queryOpts())
|
|
if err != nil {
|
|
c.Ui.Error(fmt.Sprintf("%s: Failed to retrieve pprof %s, err: %v", filename, path, err))
|
|
if structs.IsErrPermissionDenied(err) {
|
|
// All Profiles require the same permissions, so we only need to see
|
|
// one permission failure before we bail.
|
|
// But lets first drop a hint to help the operator resolve the error
|
|
|
|
c.Ui.Warn("Pprof retrieval requires agent:write ACL or enable_debug=true. See https://www.nomadproject.io/api-docs/agent#agent-runtime-profiles for more information.")
|
|
return // only exit on 403
|
|
}
|
|
} else {
|
|
err := c.writeBytes(path, filename, bs)
|
|
if err != nil {
|
|
c.Ui.Error(err.Error())
|
|
}
|
|
}
|
|
|
|
// goroutine debug type 1 = legacy text format for human readable output
|
|
opts.Debug = 1
|
|
c.savePprofProfile(path, "goroutine", opts, client)
|
|
|
|
// goroutine debug type 2 = goroutine stacks in panic format
|
|
opts.Debug = 2
|
|
c.savePprofProfile(path, "goroutine", opts, client)
|
|
|
|
// Reset to pprof binary format
|
|
opts.Debug = 0
|
|
|
|
c.savePprofProfile(path, "goroutine", opts, client) // Stack traces of all current goroutines
|
|
c.savePprofProfile(path, "trace", opts, client) // A trace of execution of the current program
|
|
c.savePprofProfile(path, "heap", opts, client) // A sampling of memory allocations of live objects. You can specify the gc GET parameter to run GC before taking the heap sample.
|
|
c.savePprofProfile(path, "allocs", opts, client) // A sampling of all past memory allocations
|
|
c.savePprofProfile(path, "threadcreate", opts, client) // Stack traces that led to the creation of new OS threads
|
|
|
|
// This profile is disabled by default -- Requires runtime.SetBlockProfileRate to enable
|
|
// c.savePprofProfile(path, "block", opts, client) // Stack traces that led to blocking on synchronization primitives
|
|
|
|
// This profile is disabled by default -- Requires runtime.SetMutexProfileFraction to enable
|
|
// c.savePprofProfile(path, "mutex", opts, client) // Stack traces of holders of contended mutexes
|
|
}
|
|
|
|
// savePprofProfile retrieves a pprof profile and writes to disk
|
|
func (c *OperatorDebugCommand) savePprofProfile(path string, profile string, opts api.PprofOptions, client *api.Client) {
|
|
fileName := fmt.Sprintf("%s.prof", profile)
|
|
if opts.Debug > 0 {
|
|
fileName = fmt.Sprintf("%s-debug%d.txt", profile, opts.Debug)
|
|
}
|
|
|
|
bs, err := retrievePprofProfile(profile, opts, client, c.queryOpts())
|
|
if err != nil {
|
|
c.Ui.Error(fmt.Sprintf("%s: Failed to retrieve pprof %s, err: %s", path, fileName, err.Error()))
|
|
}
|
|
|
|
err = c.writeBytes(path, fileName, bs)
|
|
if err != nil {
|
|
c.Ui.Error(fmt.Sprintf("%s: Failed to write file %s, err: %s", path, fileName, err.Error()))
|
|
}
|
|
}
|
|
|
|
// retrievePprofProfile gets a pprof profile from the node specified
|
|
// in opts using the API client
|
|
func retrievePprofProfile(profile string, opts api.PprofOptions, client *api.Client, qopts *api.QueryOptions) (bs []byte, err error) {
|
|
switch profile {
|
|
case "cpuprofile":
|
|
bs, err = client.Agent().CPUProfile(opts, qopts)
|
|
case "trace":
|
|
bs, err = client.Agent().Trace(opts, qopts)
|
|
default:
|
|
bs, err = client.Agent().Lookup(profile, opts, qopts)
|
|
}
|
|
|
|
return bs, err
|
|
}
|
|
|
|
// collectPeriodic runs for duration, capturing the cluster state
|
|
// every interval. It flushes and stops the monitor requests
|
|
func (c *OperatorDebugCommand) collectPeriodic(client *api.Client) {
|
|
duration := time.After(c.duration)
|
|
// Set interval to 0 so that we immediately execute, wait the interval next time
|
|
interval := time.After(0 * time.Second)
|
|
var intervalCount int
|
|
var name, dir string
|
|
|
|
for {
|
|
select {
|
|
case <-duration:
|
|
c.cancel()
|
|
return
|
|
|
|
case <-interval:
|
|
name = fmt.Sprintf("%04d", intervalCount)
|
|
dir = filepath.Join(intervalDir, name)
|
|
c.Ui.Output(fmt.Sprintf(" Capture interval %s", name))
|
|
c.collectNomad(dir, client)
|
|
c.collectOperator(dir, client)
|
|
interval = time.After(c.interval)
|
|
intervalCount++
|
|
|
|
case <-c.ctx.Done():
|
|
return
|
|
}
|
|
}
|
|
}
|
|
|
|
// collectOperator captures some cluster meta information
|
|
func (c *OperatorDebugCommand) collectOperator(dir string, client *api.Client) {
|
|
rc, err := client.Operator().RaftGetConfiguration(c.queryOpts())
|
|
c.writeJSON(dir, "operator-raft.json", rc, err)
|
|
|
|
sc, _, err := client.Operator().SchedulerGetConfiguration(c.queryOpts())
|
|
c.writeJSON(dir, "operator-scheduler.json", sc, err)
|
|
|
|
ah, _, err := client.Operator().AutopilotServerHealth(c.queryOpts())
|
|
c.writeJSON(dir, "operator-autopilot-health.json", ah, err)
|
|
|
|
lic, _, err := client.Operator().LicenseGet(c.queryOpts())
|
|
c.writeJSON(dir, "license.json", lic, err)
|
|
}
|
|
|
|
// collectNomad captures the nomad cluster state
|
|
func (c *OperatorDebugCommand) collectNomad(dir string, client *api.Client) error {
|
|
|
|
js, _, err := client.Jobs().List(c.queryOpts())
|
|
c.writeJSON(dir, "jobs.json", js, err)
|
|
|
|
ds, _, err := client.Deployments().List(c.queryOpts())
|
|
c.writeJSON(dir, "deployments.json", ds, err)
|
|
|
|
es, _, err := client.Evaluations().List(c.queryOpts())
|
|
c.writeJSON(dir, "evaluations.json", es, err)
|
|
|
|
as, _, err := client.Allocations().List(c.queryOpts())
|
|
c.writeJSON(dir, "allocations.json", as, err)
|
|
|
|
ns, _, err := client.Nodes().List(c.queryOpts())
|
|
c.writeJSON(dir, "nodes.json", ns, err)
|
|
|
|
// CSI Plugins - /v1/plugins?type=csi
|
|
ps, _, err := client.CSIPlugins().List(c.queryOpts())
|
|
c.writeJSON(dir, "csi-plugins.json", ps, err)
|
|
|
|
// CSI Plugin details - /v1/plugin/csi/:plugin_id
|
|
for _, p := range ps {
|
|
csiPlugin, _, err := client.CSIPlugins().Info(p.ID, c.queryOpts())
|
|
csiPluginFileName := fmt.Sprintf("csi-plugin-id-%s.json", p.ID)
|
|
c.writeJSON(dir, csiPluginFileName, csiPlugin, err)
|
|
}
|
|
|
|
// CSI Volumes - /v1/volumes?type=csi
|
|
csiVolumes, _, err := client.CSIVolumes().List(c.queryOpts())
|
|
c.writeJSON(dir, "csi-volumes.json", csiVolumes, err)
|
|
|
|
// CSI Volume details - /v1/volumes/csi/:volume-id
|
|
for _, v := range csiVolumes {
|
|
csiVolume, _, err := client.CSIVolumes().Info(v.ID, c.queryOpts())
|
|
csiFileName := fmt.Sprintf("csi-volume-id-%s.json", v.ID)
|
|
c.writeJSON(dir, csiFileName, csiVolume, err)
|
|
}
|
|
|
|
metrics, _, err := client.Operator().MetricsSummary(c.queryOpts())
|
|
c.writeJSON(dir, "metrics.json", metrics, err)
|
|
|
|
return nil
|
|
}
|
|
|
|
// collectConsul calls the Consul API to collect data
|
|
func (c *OperatorDebugCommand) collectConsul(dir string) {
|
|
if c.consul.addrVal == "" {
|
|
c.Ui.Output("Consul - Skipping, no API address found")
|
|
return
|
|
}
|
|
|
|
c.Ui.Info(fmt.Sprintf("Consul - Collecting Consul API data from: %s", c.consul.addrVal))
|
|
|
|
client, err := c.consulAPIClient()
|
|
if err != nil {
|
|
c.Ui.Error(fmt.Sprintf("failed to create Consul API client: %s", err))
|
|
return
|
|
}
|
|
|
|
// Exit if we are unable to retrieve the leader
|
|
err = c.collectConsulAPIRequest(client, "/v1/status/leader", dir, "consul-leader.json")
|
|
if err != nil {
|
|
c.Ui.Output(fmt.Sprintf("Unable to contact Consul leader, skipping: %s", err))
|
|
return
|
|
}
|
|
|
|
c.collectConsulAPI(client, "/v1/agent/host", dir, "consul-agent-host.json")
|
|
c.collectConsulAPI(client, "/v1/agent/members", dir, "consul-agent-members.json")
|
|
c.collectConsulAPI(client, "/v1/agent/metrics", dir, "consul-agent-metrics.json")
|
|
c.collectConsulAPI(client, "/v1/agent/self", dir, "consul-agent-self.json")
|
|
}
|
|
|
|
func (c *OperatorDebugCommand) consulAPIClient() (*http.Client, error) {
|
|
httpClient := defaultHttpClient()
|
|
|
|
err := api.ConfigureTLS(httpClient, c.consul.tls)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to configure TLS: %w", err)
|
|
}
|
|
|
|
return httpClient, nil
|
|
}
|
|
|
|
func (c *OperatorDebugCommand) collectConsulAPI(client *http.Client, urlPath string, dir string, file string) {
|
|
err := c.collectConsulAPIRequest(client, urlPath, dir, file)
|
|
if err != nil {
|
|
c.Ui.Error(fmt.Sprintf("Error collecting from Consul API: %s", err.Error()))
|
|
}
|
|
}
|
|
|
|
func (c *OperatorDebugCommand) collectConsulAPIRequest(client *http.Client, urlPath string, dir string, file string) error {
|
|
url := c.consul.addrVal + urlPath
|
|
|
|
req, err := http.NewRequest("GET", url, nil)
|
|
if err != nil {
|
|
return fmt.Errorf("failed to create HTTP request for Consul API URL=%q: %w", url, err)
|
|
}
|
|
|
|
req.Header.Add("X-Consul-Token", c.consul.token())
|
|
req.Header.Add("User-Agent", userAgent)
|
|
|
|
resp, err := client.Do(req)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
c.writeBody(dir, file, resp, err)
|
|
|
|
return nil
|
|
}
|
|
|
|
// collectVault calls the Vault API directly to collect data
|
|
func (c *OperatorDebugCommand) collectVault(dir, vault string) error {
|
|
vaultAddr := c.vault.addr(vault)
|
|
if vaultAddr == "" {
|
|
return nil
|
|
}
|
|
|
|
c.Ui.Info(fmt.Sprintf("Vault - Collecting Vault API data from: %s", vaultAddr))
|
|
client := defaultHttpClient()
|
|
if c.vault.ssl {
|
|
err := api.ConfigureTLS(client, c.vault.tls)
|
|
if err != nil {
|
|
return fmt.Errorf("failed to configure TLS: %w", err)
|
|
}
|
|
}
|
|
|
|
req, err := http.NewRequest("GET", vaultAddr+"/v1/sys/health", nil)
|
|
if err != nil {
|
|
return fmt.Errorf("failed to create HTTP request for Vault API URL=%q: %w", vaultAddr, err)
|
|
}
|
|
|
|
req.Header.Add("X-Vault-Token", c.vault.token())
|
|
req.Header.Add("User-Agent", userAgent)
|
|
resp, err := client.Do(req)
|
|
c.writeBody(dir, "vault-sys-health.json", resp, err)
|
|
|
|
return nil
|
|
}
|
|
|
|
// writeBytes writes a file to the archive, recording it in the manifest
|
|
func (c *OperatorDebugCommand) writeBytes(dir, file string, data []byte) error {
|
|
// Replace invalid characters in filename
|
|
filename := helper.CleanFilename(file, "_")
|
|
|
|
relativePath := filepath.Join(dir, filename)
|
|
c.manifest = append(c.manifest, relativePath)
|
|
dirPath := filepath.Join(c.collectDir, dir)
|
|
filePath := filepath.Join(dirPath, filename)
|
|
|
|
// Ensure parent directories exist
|
|
err := os.MkdirAll(dirPath, os.ModePerm)
|
|
if err != nil {
|
|
return fmt.Errorf("failed to create parent directories of \"%s\": %w", dirPath, err)
|
|
}
|
|
|
|
// Ensure filename doesn't escape the sandbox of the capture directory
|
|
escapes := helper.PathEscapesSandbox(c.collectDir, filePath)
|
|
if escapes {
|
|
return fmt.Errorf("file path \"%s\" escapes capture directory \"%s\"", filePath, c.collectDir)
|
|
}
|
|
|
|
// Create the file
|
|
fh, err := os.Create(filePath)
|
|
if err != nil {
|
|
return fmt.Errorf("failed to create file \"%s\", err: %w", filePath, err)
|
|
}
|
|
defer fh.Close()
|
|
|
|
_, err = fh.Write(data)
|
|
if err != nil {
|
|
return fmt.Errorf("Failed to write data to file \"%s\", err: %w", filePath, err)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// writeJSON writes JSON responses from the Nomad API calls to the archive
|
|
func (c *OperatorDebugCommand) writeJSON(dir, file string, data interface{}, err error) error {
|
|
if err != nil {
|
|
return c.writeError(dir, file, err)
|
|
}
|
|
bytes, err := json.Marshal(data)
|
|
if err != nil {
|
|
return c.writeError(dir, file, err)
|
|
}
|
|
err = c.writeBytes(dir, file, bytes)
|
|
if err != nil {
|
|
c.Ui.Error(err.Error())
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// writeError writes a JSON error object to capture errors in the debug bundle without
|
|
// reporting
|
|
func (c *OperatorDebugCommand) writeError(dir, file string, err error) error {
|
|
bytes, err := json.Marshal(errorWrapper{Error: err.Error()})
|
|
if err != nil {
|
|
return err
|
|
}
|
|
return c.writeBytes(dir, file, bytes)
|
|
}
|
|
|
|
type errorWrapper struct {
|
|
Error string
|
|
}
|
|
|
|
// writeBody is a helper that writes the body of an http.Response to the archive
|
|
func (c *OperatorDebugCommand) writeBody(dir, file string, resp *http.Response, err error) {
|
|
if err != nil {
|
|
c.writeError(dir, file, err)
|
|
return
|
|
}
|
|
|
|
if resp.ContentLength == 0 {
|
|
return
|
|
}
|
|
|
|
defer resp.Body.Close()
|
|
|
|
body, err := ioutil.ReadAll(resp.Body)
|
|
if err != nil {
|
|
c.writeError(dir, file, err)
|
|
return
|
|
}
|
|
|
|
if err := c.writeBytes(dir, file, body); err != nil {
|
|
c.Ui.Error(err.Error())
|
|
}
|
|
}
|
|
|
|
type flagExport struct {
|
|
Name string
|
|
Parsed bool
|
|
Actual map[string]*flag.Flag
|
|
Formal map[string]*flag.Flag
|
|
Effective map[string]*flag.Flag // All flags with non-empty value
|
|
Args []string // arguments after flags
|
|
OsArgs []string
|
|
}
|
|
|
|
// writeFlags exports the CLI flags to JSON file
|
|
func (c *OperatorDebugCommand) writeFlags(flags *flag.FlagSet) {
|
|
// c.writeJSON(clusterDir, "cli-flags-complete.json", flags, nil)
|
|
|
|
var f flagExport
|
|
f.Name = flags.Name()
|
|
f.Parsed = flags.Parsed()
|
|
f.Formal = make(map[string]*flag.Flag)
|
|
f.Actual = make(map[string]*flag.Flag)
|
|
f.Effective = make(map[string]*flag.Flag)
|
|
f.Args = flags.Args()
|
|
f.OsArgs = os.Args
|
|
|
|
// Formal flags (all flags)
|
|
flags.VisitAll(func(flagA *flag.Flag) {
|
|
f.Formal[flagA.Name] = flagA
|
|
|
|
// Determine which of thees are "effective" flags by comparing to empty string
|
|
if flagA.Value.String() != "" {
|
|
f.Effective[flagA.Name] = flagA
|
|
}
|
|
})
|
|
// Actual flags (everything passed on cmdline)
|
|
flags.Visit(func(flag *flag.Flag) {
|
|
f.Actual[flag.Name] = flag
|
|
})
|
|
|
|
c.writeJSON(clusterDir, "cli-flags.json", f, nil)
|
|
}
|
|
|
|
// writeManifest creates the index files
|
|
func (c *OperatorDebugCommand) writeManifest() error {
|
|
// Write the JSON
|
|
path := filepath.Join(c.collectDir, "index.json")
|
|
jsonFh, err := os.Create(path)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
defer jsonFh.Close()
|
|
|
|
json.NewEncoder(jsonFh).Encode(c.manifest)
|
|
|
|
// Write the HTML
|
|
path = filepath.Join(c.collectDir, "index.html")
|
|
htmlFh, err := os.Create(path)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
defer htmlFh.Close()
|
|
|
|
head, _ := template.New("head").Parse("<html><head><title>{{.}}</title></head>\n<body><h1>{{.}}</h1>\n<ul>")
|
|
line, _ := template.New("line").Parse("<li><a href=\"{{.}}\">{{.}}</a></li>\n")
|
|
if err != nil {
|
|
return fmt.Errorf("%v", err)
|
|
}
|
|
tail := "</ul></body></html>\n"
|
|
|
|
head.Execute(htmlFh, c.timestamp)
|
|
for _, f := range c.manifest {
|
|
line.Execute(htmlFh, f)
|
|
}
|
|
htmlFh.WriteString(tail)
|
|
|
|
return nil
|
|
}
|
|
|
|
// trap captures signals, and closes stopCh
|
|
func (c *OperatorDebugCommand) trap() {
|
|
sigCh := make(chan os.Signal, 1)
|
|
signal.Notify(sigCh,
|
|
syscall.SIGHUP,
|
|
syscall.SIGINT,
|
|
syscall.SIGTERM,
|
|
syscall.SIGQUIT)
|
|
|
|
go func() {
|
|
<-sigCh
|
|
c.cancel()
|
|
}()
|
|
}
|
|
|
|
func (c *OperatorDebugCommand) verboseOut(out string) {
|
|
if c.verbose {
|
|
c.Ui.Output(out)
|
|
}
|
|
}
|
|
|
|
func (c *OperatorDebugCommand) verboseOutf(format string, a ...interface{}) {
|
|
c.verboseOut(fmt.Sprintf(format, a...))
|
|
}
|
|
|
|
// TarCZF like the tar command, recursively builds a gzip compressed tar
|
|
// archive from a directory. If not empty, all files in the bundle are prefixed
|
|
// with the target path.
|
|
func TarCZF(archive string, src, target string) error {
|
|
// ensure the src actually exists before trying to tar it
|
|
if _, err := os.Stat(src); err != nil {
|
|
return fmt.Errorf("Unable to tar files - %v", err.Error())
|
|
}
|
|
|
|
// create the archive
|
|
fh, err := os.Create(archive)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
defer fh.Close()
|
|
|
|
zz := gzip.NewWriter(fh)
|
|
defer zz.Close()
|
|
|
|
tw := tar.NewWriter(zz)
|
|
defer tw.Close()
|
|
|
|
// tar
|
|
return filepath.Walk(src, func(file string, fi os.FileInfo, err error) error {
|
|
|
|
// return on any error
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
if !fi.Mode().IsRegular() {
|
|
return nil
|
|
}
|
|
|
|
header, err := tar.FileInfoHeader(fi, fi.Name())
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
// remove leading path to the src, so files are relative to the archive
|
|
path := strings.ReplaceAll(file, src, "")
|
|
if target != "" {
|
|
path = filepath.Join([]string{target, path}...)
|
|
}
|
|
path = strings.TrimPrefix(path, string(filepath.Separator))
|
|
|
|
header.Name = path
|
|
|
|
if err := tw.WriteHeader(header); err != nil {
|
|
return err
|
|
}
|
|
|
|
// copy the file contents
|
|
f, err := os.Open(file)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
if _, err := io.Copy(tw, f); err != nil {
|
|
return err
|
|
}
|
|
|
|
f.Close()
|
|
|
|
return nil
|
|
})
|
|
}
|
|
|
|
// filterServerMembers returns a slice of server member names matching the search criteria
|
|
func filterServerMembers(serverMembers *api.ServerMembers, serverIDs string, region string) (membersFound []string, err error) {
|
|
if serverMembers.Members == nil {
|
|
return nil, fmt.Errorf("Failed to parse server members, members==nil")
|
|
}
|
|
|
|
prefixes := stringToSlice(serverIDs)
|
|
|
|
// "leader" is a special case which Nomad handles in the API. If "leader"
|
|
// appears in serverIDs, add it to membersFound and remove it from the list
|
|
// so that it isn't processed by the range loop
|
|
if helper.SliceStringContains(prefixes, "leader") {
|
|
membersFound = append(membersFound, "leader")
|
|
helper.RemoveEqualFold(&prefixes, "leader")
|
|
}
|
|
|
|
for _, member := range serverMembers.Members {
|
|
// If region is provided it must match exactly
|
|
if region != "" && member.Tags["region"] != region {
|
|
continue
|
|
}
|
|
|
|
// Always include "all"
|
|
if serverIDs == "all" {
|
|
membersFound = append(membersFound, member.Name)
|
|
continue
|
|
}
|
|
|
|
// Include member if name matches any prefix from serverIDs
|
|
if helper.StringHasPrefixInSlice(member.Name, prefixes) {
|
|
membersFound = append(membersFound, member.Name)
|
|
}
|
|
}
|
|
|
|
return membersFound, nil
|
|
}
|
|
|
|
// stringToSlice splits comma-separated input string into slice, trims
|
|
// whitespace, and prunes empty values
|
|
func stringToSlice(input string) []string {
|
|
ns := strings.Split(input, ",")
|
|
var out []string
|
|
for _, n := range ns {
|
|
s := strings.TrimSpace(n)
|
|
if s == "" {
|
|
continue
|
|
}
|
|
out = append(out, s)
|
|
}
|
|
return out
|
|
}
|
|
|
|
func parseEventTopics(topicList []string) (map[api.Topic][]string, error) {
|
|
topics := make(map[api.Topic][]string)
|
|
|
|
var mErrs *multierror.Error
|
|
|
|
for _, topic := range topicList {
|
|
k, v, err := parseTopic(topic)
|
|
if err != nil {
|
|
mErrs = multierror.Append(mErrs, err)
|
|
}
|
|
|
|
topics[api.Topic(k)] = append(topics[api.Topic(k)], v)
|
|
}
|
|
|
|
return topics, mErrs.ErrorOrNil()
|
|
}
|
|
|
|
func parseTopic(input string) (string, string, error) {
|
|
var topic, filter string
|
|
|
|
parts := strings.Split(input, ":")
|
|
switch len(parts) {
|
|
case 1:
|
|
// infer wildcard if only given a topic
|
|
topic = input
|
|
filter = "*"
|
|
case 2:
|
|
topic = parts[0]
|
|
filter = parts[1]
|
|
default:
|
|
return "", "", fmt.Errorf("Invalid key value pair for topic: %s", topic)
|
|
}
|
|
|
|
return strings.Title(topic), filter, nil
|
|
}
|
|
|
|
func allTopics() map[api.Topic][]string {
|
|
return map[api.Topic][]string{"*": {"*"}}
|
|
}
|
|
|
|
// topicsFromString parses a comma separated list into a topicMap
|
|
func topicsFromString(topicList string) (map[api.Topic][]string, error) {
|
|
if topicList == "none" {
|
|
return nil, nil
|
|
}
|
|
if topicList == "all" {
|
|
return allTopics(), nil
|
|
}
|
|
|
|
topics := stringToSlice(topicList)
|
|
topicMap, err := parseEventTopics(topics)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
return topicMap, nil
|
|
}
|
|
|
|
// external holds address configuration for Consul and Vault APIs
|
|
type external struct {
|
|
tls *api.TLSConfig
|
|
addrVal string
|
|
auth string
|
|
ssl bool
|
|
tokenVal string
|
|
tokenFile string
|
|
}
|
|
|
|
func (e *external) addr(defaultAddr string) string {
|
|
if e.addrVal == "" {
|
|
return defaultAddr
|
|
}
|
|
|
|
// Return address as-is if it contains a protocol
|
|
if strings.Contains(e.addrVal, "://") {
|
|
return e.addrVal
|
|
}
|
|
|
|
if e.ssl {
|
|
return "https://" + e.addrVal
|
|
}
|
|
|
|
return "http://" + e.addrVal
|
|
}
|
|
|
|
func (e *external) setAddr(addr string) {
|
|
// Handle no protocol scenario first
|
|
if !strings.Contains(addr, "://") {
|
|
e.addrVal = "http://" + addr
|
|
if e.ssl {
|
|
e.addrVal = "https://" + addr
|
|
}
|
|
return
|
|
}
|
|
|
|
// Set SSL boolean based on protocol
|
|
e.ssl = false
|
|
if strings.Contains(addr, "https") {
|
|
e.ssl = true
|
|
}
|
|
e.addrVal = addr
|
|
}
|
|
|
|
func (e *external) token() string {
|
|
if e.tokenVal != "" {
|
|
return e.tokenVal
|
|
}
|
|
|
|
if e.tokenFile != "" {
|
|
bs, err := ioutil.ReadFile(e.tokenFile)
|
|
if err == nil {
|
|
return strings.TrimSpace(string(bs))
|
|
}
|
|
}
|
|
|
|
return ""
|
|
}
|
|
|
|
func (c *OperatorDebugCommand) getConsulAddrFromSelf(self *api.AgentSelf) string {
|
|
if self == nil {
|
|
return ""
|
|
}
|
|
|
|
var consulAddr string
|
|
r, ok := self.Config["Consul"]
|
|
if ok {
|
|
m, ok := r.(map[string]interface{})
|
|
if ok {
|
|
raw := m["EnableSSL"]
|
|
c.consul.ssl, _ = raw.(bool)
|
|
raw = m["Addr"]
|
|
c.consul.setAddr(raw.(string))
|
|
raw = m["Auth"]
|
|
c.consul.auth, _ = raw.(string)
|
|
raw = m["Token"]
|
|
c.consul.tokenVal = raw.(string)
|
|
|
|
consulAddr = c.consul.addr("")
|
|
}
|
|
}
|
|
return consulAddr
|
|
}
|
|
|
|
func (c *OperatorDebugCommand) getVaultAddrFromSelf(self *api.AgentSelf) string {
|
|
if self == nil {
|
|
return ""
|
|
}
|
|
|
|
var vaultAddr string
|
|
r, ok := self.Config["Vault"]
|
|
if ok {
|
|
m, ok := r.(map[string]interface{})
|
|
if ok {
|
|
raw := m["EnableSSL"]
|
|
c.vault.ssl, _ = raw.(bool)
|
|
raw = m["Addr"]
|
|
c.vault.setAddr(raw.(string))
|
|
raw = m["Auth"]
|
|
c.vault.auth, _ = raw.(string)
|
|
raw = m["Token"]
|
|
c.vault.tokenVal = raw.(string)
|
|
|
|
vaultAddr = c.vault.addr("")
|
|
}
|
|
}
|
|
return vaultAddr
|
|
}
|
|
|
|
// defaultHttpClient configures a basic httpClient
|
|
func defaultHttpClient() *http.Client {
|
|
httpClient := cleanhttp.DefaultClient()
|
|
transport := httpClient.Transport.(*http.Transport)
|
|
transport.TLSHandshakeTimeout = 10 * time.Second
|
|
transport.TLSClientConfig = &tls.Config{
|
|
MinVersion: tls.VersionTLS12,
|
|
}
|
|
|
|
return httpClient
|
|
}
|
|
|
|
// isRedirectError returns true if an error is a redirect error.
|
|
func isRedirectError(err error) bool {
|
|
if err == nil {
|
|
return false
|
|
}
|
|
|
|
const redirectErr string = `invalid character '<' looking for beginning of value`
|
|
return strings.Contains(err.Error(), redirectErr)
|
|
}
|