debug command archive content changes (#8462)

* command/debug: print interval data so the operator knows its waiting

* command/debug: use the Consul/Vault env for queries

* command/debug: capture the operator endpoints

* command/debug: capture API errors in the archive bundle
This commit is contained in:
Lang Martin 2020-08-11 13:14:28 -04:00 committed by GitHub
parent a27913e699
commit 1d7998f39f
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 305 additions and 156 deletions

View file

@ -4,6 +4,7 @@ import (
"archive/tar"
"compress/gzip"
"context"
"crypto/tls"
"encoding/json"
"fmt"
"html/template"
@ -13,10 +14,12 @@ import (
"os"
"os/signal"
"path/filepath"
"strconv"
"strings"
"syscall"
"time"
"github.com/hashicorp/go-cleanhttp"
"github.com/hashicorp/nomad/api"
"github.com/posener/complete"
)
@ -24,18 +27,19 @@ import (
type DebugCommand struct {
Meta
timestamp string
collectDir string
duration time.Duration
interval time.Duration
logLevel string
nodeIDs []string
serverIDs []string
consulToken string
vaultToken string
manifest []string
ctx context.Context
cancel context.CancelFunc
timestamp string
collectDir string
duration time.Duration
interval time.Duration
logLevel string
stale bool
nodeIDs []string
serverIDs []string
consul *external
vault *external
manifest []string
ctx context.Context
cancel context.CancelFunc
}
const (
@ -55,34 +59,83 @@ General Options:
Debug Options:
-duration=2m
The duration of the log monitor command. Defaults to 2m.
-duration=<interval>
The duration of the log monitor command. Defaults to 2m.
-interval=2m
The interval between snapshots of the Nomad state. If unspecified, only one snapshot is
captured.
-interval=<interval>
The interval between snapshots of the Nomad state. If unspecified, only one snapshot is
captured.
-log-level=DEBUG
The log level to monitor. Defaults to DEBUG.
-log-level=<level>
The log level to monitor. Defaults to DEBUG.
-node-id=n1,n2
Comma separated list of Nomad client node ids, to monitor for logs and include pprof
profiles. Accepts id prefixes.
-node-id=<node>,<node>
Comma separated list of Nomad client node ids, to monitor for logs and include pprof
profiles. Accepts id prefixes.
-server-id=s1,s2
Comma separated list of Nomad server names, or "leader" to monitor for logs and include pprof
profiles.
-server-id=<server>,<server>
Comma separated list of Nomad server names, or "leader" to monitor for logs and include pprof
profiles.
-output=path
Path to the parent directory of the output directory. If not specified, an archive is built
in the current directory.
-stale=<true|false>
If "false", the default, get membership data from the cluster leader. If the cluster is in
an outage unable to establish leadership, it may be necessary to get the configuration from
a non-leader server.
-consul-token
Token used to query Consul. Defaults to CONSUL_HTTP_TOKEN or the contents of
CONSUL_HTTP_TOKEN_FILE
-output=<path>
Path to the parent directory of the output directory. If not specified, an archive is built
in the current directory.
-vault-token
Token used to query Vault. Defaults to VAULT_TOKEN
-consul-http-addr=<addr>
The address and port of the Consul HTTP agent. Can be specified by CONSUL_HTTP_ADDR
-consul-token=<token>
Token used to query Consul. Overrides the CONSUL_HTTP_TOKEN environment
variable and the Consul token file.
-consul-token-file=<path>
Path to the Consul token file. Overrides the CONSUL_HTTP_TOKEN_FILE
environment variable.
-consul-client-cert=<path>
Path to the Consul client cert file. Overrides the CONSUL_CLIENT_CERT
environment variable.
-consul-client-key=<path>
Path to the Consul client key file. Overrides the CONSUL_CLIENT_KEY
environment variable.
-consul-ca-cert=<path>
Path to a CA file to use with Consul. Overrides the CONSUL_CACERT
environment variable and the Consul CA path.
-consul-ca-path=<path>
Path to a directory of PEM encoded CA cert files to verify the Consul
certificate. Overrides the CONSUL_CAPATH environment variable.
-vault-address=<addr>
The address and port of the Vault HTTP agent. Overrides the VAULT_ADDR
environment variable.
-vault-token=<token>
Token used to query Vault. Overrides the VAULT_TOKEN environment
variable.
-vault-client-cert=<path>
Path to the Vault client cert file. Overrides the VAULT_CLIENT_CERT
environment variable.
-vault-client-key=<path>
Path to the Vault client key file. Overrides the VAULT_CLIENT_KEY
environment variable.
-vault-ca-cert=<path>
Path to a CA file to use with Vault. Overrides the VAULT_CACERT
environment variable and the Vault CA path.
-vault-ca-path=<path>
Path to a directory of PEM encoded CA cert files to verify the Vault
certificate. Overrides the VAULT_CAPATH environment variable.
`
return strings.TrimSpace(helpText)
}
@ -123,9 +176,28 @@ func (c *DebugCommand) Run(args []string) int {
flags.StringVar(&c.logLevel, "log-level", "DEBUG", "")
flags.StringVar(&nodeIDs, "node-id", "", "")
flags.StringVar(&serverIDs, "server-id", "", "")
flags.BoolVar(&c.stale, "stale", false, "")
flags.StringVar(&output, "output", "", "")
flags.StringVar(&c.consulToken, "consul-token", "", "")
flags.StringVar(&c.vaultToken, "vault-token", "", "")
c.consul = &external{tls: &api.TLSConfig{}}
flags.StringVar(&c.consul.addrVal, "consul-http-addr", os.Getenv("CONSUL_HTTP_ADDR"), "")
ssl := os.Getenv("CONSUL_HTTP_SSL")
c.consul.ssl, _ = strconv.ParseBool(ssl)
flags.StringVar(&c.consul.auth, "consul-auth", os.Getenv("CONSUL_HTTP_AUTH"), "")
flags.StringVar(&c.consul.tokenVal, "consul-token", os.Getenv("CONSUL_HTTP_TOKEN"), "")
flags.StringVar(&c.consul.tokenFile, "consul-token-file", os.Getenv("CONSUL_HTTP_TOKEN_FILE"), "")
flags.StringVar(&c.consul.tls.ClientCert, "consul-client-cert", os.Getenv("CONSUL_CLIENT_CERT"), "")
flags.StringVar(&c.consul.tls.ClientKey, "consul-client-key", os.Getenv("CONSUL_CLIENT_KEY"), "")
flags.StringVar(&c.consul.tls.CACert, "consul-ca-cert", os.Getenv("CONSUL_CACERT"), "")
flags.StringVar(&c.consul.tls.CAPath, "consul-ca-path", os.Getenv("CONSUL_CAPATH"), "")
c.vault = &external{tls: &api.TLSConfig{}}
flags.StringVar(&c.vault.addrVal, "vault-address", os.Getenv("VAULT_ADDR"), "")
flags.StringVar(&c.vault.tokenVal, "vault-token", os.Getenv("VAULT_TOKEN"), "")
flags.StringVar(&c.vault.tls.CACert, "vault-ca-cert", os.Getenv("VAULT_CACERT"), "")
flags.StringVar(&c.vault.tls.CAPath, "vault-ca-path", os.Getenv("VAULT_CAPATH"), "")
flags.StringVar(&c.vault.tls.ClientCert, "vault-client-cert", os.Getenv("VAULT_CLIENT_CERT"), "")
flags.StringVar(&c.vault.tls.ClientKey, "vault-client-key", os.Getenv("VAULT_CLIENT_KEY"), "")
if err := flags.Parse(args); err != nil {
return 1
@ -187,12 +259,20 @@ func (c *DebugCommand) Run(args []string) int {
ctx, cancel := context.WithCancel(context.Background())
c.ctx = ctx
c.cancel = cancel
c.trap()
// Setup the output path
format := "2006-01-02-150405Z"
c.timestamp = time.Now().UTC().Format(format)
stamped := "nomad-debug-" + c.timestamp
c.Ui.Output("Starting debugger and capturing cluster data...")
if len(c.nodeIDs) > 0 || len(c.serverIDs) > 0 {
c.Ui.Output(fmt.Sprintf(" Interval: '%s'", interval))
c.Ui.Output(fmt.Sprintf(" Duration: '%s'", duration))
}
// Create the output path
var tmp string
if output != "" {
tmp = filepath.Join(output, stamped)
@ -212,9 +292,6 @@ func (c *DebugCommand) Run(args []string) int {
c.collectDir = tmp
// Capture signals so we can shutdown the monitor API calls on Int
c.trap()
err = c.collect(client)
if err != nil {
c.Ui.Error(fmt.Sprintf("Error collecting data: %s", err.Error()))
@ -229,7 +306,7 @@ func (c *DebugCommand) Run(args []string) int {
}
archiveFile := stamped + ".tar.gz"
err = TarCZF(archiveFile, tmp)
err = TarCZF(archiveFile, tmp, stamped)
if err != nil {
c.Ui.Error(fmt.Sprintf("Error creating archive: %s", err.Error()))
return 2
@ -249,40 +326,46 @@ func (c *DebugCommand) collect(client *api.Client) error {
}
self, err := client.Agent().Self()
if err != nil {
return fmt.Errorf("agent self: %s", err.Error())
}
c.writeJSON(dir, "agent-self.json", self)
c.writeJSON(dir, "agent-self.json", self, err)
// Fetch data directly from consul and vault. Ignore errors
var consul, vault string
m, ok := self.Config["Consul"].(map[string]interface{})
if ok {
raw := m["Addr"]
consul, _ = raw.(string)
raw = m["EnableSSL"]
ssl, _ := raw.(bool)
if ssl {
consul = "https://" + consul
} else {
consul = "http://" + consul
}
}
if self != nil {
r, ok := self.Config["Consul"]
if ok {
m, ok := r.(map[string]interface{})
if ok {
m, ok = self.Config["Vault"].(map[string]interface{})
if ok {
raw := m["Addr"]
vault, _ = raw.(string)
raw := m["Addr"]
consul, _ = raw.(string)
raw = m["EnableSSL"]
ssl, _ := raw.(bool)
if ssl {
consul = "https://" + consul
} else {
consul = "http://" + consul
}
}
}
r, ok = self.Config["Vault"]
if ok {
m, ok := r.(map[string]interface{})
if ok {
raw := m["Addr"]
vault, _ = raw.(string)
}
}
}
c.collectConsul(dir, consul)
c.collectVault(dir, vault)
c.collectAgentHosts(client)
c.collectPprofs(client)
c.startMonitors(client)
c.collectPeriodic(client)
c.collectPprofs(client)
c.collectAgentHosts(client)
return nil
}
@ -372,12 +455,7 @@ func (c *DebugCommand) collectAgentHost(path, id string, client *api.Client) {
path = filepath.Join(path, id)
if err != nil {
c.writeBytes(path, "agent-host.log", []byte(err.Error()))
return
}
c.writeJSON(path, "agent-host.json", host)
c.writeJSON(path, "agent-host.json", host, err)
}
// collectPprofs captures the /agent/pprof for each listed node
@ -402,6 +480,10 @@ func (c *DebugCommand) collectPprof(path, id string, client *api.Client) {
}
path = filepath.Join(path, id)
err := c.mkdir(path)
if err != nil {
return
}
bs, err := client.Agent().CPUProfile(opts, nil)
if err == nil {
@ -424,7 +506,7 @@ func (c *DebugCommand) collectPprof(path, id string, client *api.Client) {
func (c *DebugCommand) collectPeriodic(client *api.Client) {
// Not monitoring any logs, just capture the nomad context before exit
if len(c.nodeIDs) == 0 && len(c.serverIDs) == 0 {
dir := filepath.Join("nomad", "00")
dir := filepath.Join("nomad", "0000")
c.collectNomad(dir, client)
return
}
@ -433,7 +515,7 @@ func (c *DebugCommand) collectPeriodic(client *api.Client) {
// Set interval to 0 so that we immediately execute, wait the interval next time
interval := time.After(0 * time.Second)
var intervalCount int
var dir string
var name, dir string
for {
select {
@ -442,8 +524,11 @@ func (c *DebugCommand) collectPeriodic(client *api.Client) {
return
case <-interval:
dir = filepath.Join("nomad", fmt.Sprintf("%02d", intervalCount))
name = fmt.Sprintf("%04d", intervalCount)
dir = filepath.Join("nomad", name)
c.Ui.Output(fmt.Sprintf(" Capture interval %s", name))
c.collectNomad(dir, client)
c.collectOperator(dir, client)
interval = time.After(c.interval)
intervalCount += 1
@ -453,6 +538,18 @@ func (c *DebugCommand) collectPeriodic(client *api.Client) {
}
}
// collectOperator captures some cluster meta information
func (c *DebugCommand) collectOperator(dir string, client *api.Client) {
rc, err := client.Operator().RaftGetConfiguration(nil)
c.writeJSON(dir, "operator-raft.json", rc, err)
sc, _, err := client.Operator().SchedulerGetConfiguration(nil)
c.writeJSON(dir, "operator-scheduler.json", sc, err)
ah, _, err := client.Operator().AutopilotServerHealth(nil)
c.writeJSON(dir, "operator-autopilot-health.json", ah, err)
}
// collectNomad captures the nomad cluster state
func (c *DebugCommand) collectNomad(dir string, client *api.Client) error {
err := c.mkdir(dir)
@ -463,85 +560,47 @@ func (c *DebugCommand) collectNomad(dir string, client *api.Client) error {
var qo *api.QueryOptions
js, _, err := client.Jobs().List(qo)
if err != nil {
return fmt.Errorf("listing jobs: %s", err.Error())
}
c.writeJSON(dir, "jobs.json", js)
c.writeJSON(dir, "jobs.json", js, err)
ds, _, err := client.Deployments().List(qo)
if err != nil {
return fmt.Errorf("listing deployments: %s", err.Error())
}
c.writeJSON(dir, "deployments.json", ds)
c.writeJSON(dir, "deployments.json", ds, err)
es, _, err := client.Evaluations().List(qo)
if err != nil {
return fmt.Errorf("listing evaluations: %s", err.Error())
}
c.writeJSON(dir, "evaluations.json", es)
c.writeJSON(dir, "evaluations.json", es, err)
as, _, err := client.Allocations().List(qo)
if err != nil {
return fmt.Errorf("listing allocations: %s", err.Error())
}
c.writeJSON(dir, "allocations.json", as)
c.writeJSON(dir, "allocations.json", as, err)
ns, _, err := client.Nodes().List(qo)
if err != nil {
return fmt.Errorf("listing nodes: %s", err.Error())
}
c.writeJSON(dir, "nodes.json", ns)
c.writeJSON(dir, "nodes.json", ns, err)
ps, _, err := client.CSIPlugins().List(qo)
if err != nil {
return fmt.Errorf("listing plugins: %s", err.Error())
}
c.writeJSON(dir, "plugins.json", ps)
c.writeJSON(dir, "plugins.json", ps, err)
vs, _, err := client.CSIVolumes().List(qo)
if err != nil {
return fmt.Errorf("listing volumes: %s", err.Error())
}
c.writeJSON(dir, "volumes.json", vs)
c.writeJSON(dir, "volumes.json", vs, err)
return nil
}
// collectConsul calls the Consul API directly to collect data
func (c *DebugCommand) collectConsul(dir, consul string) error {
if consul == "" {
addr := c.consul.addr(consul)
if addr == "" {
return nil
}
token := c.consulToken
if token == "" {
token = os.Getenv("CONSUL_HTTP_TOKEN")
}
if token == "" {
file := os.Getenv("CONSUL_HTTP_TOKEN_FILE")
if file != "" {
fh, err := os.Open(file)
if err == nil {
bs, err := ioutil.ReadAll(fh)
if err == nil {
token = strings.TrimSpace(string(bs))
}
}
}
}
client := defaultHttpClient()
api.ConfigureTLS(client, c.consul.tls)
client := http.Client{
Timeout: 2 * time.Second,
}
req, _ := http.NewRequest("GET", consul+"/v1/agent/self", nil)
req.Header.Add("X-Consul-Token", token)
req, _ := http.NewRequest("GET", addr+"/v1/agent/self", nil)
req.Header.Add("X-Consul-Token", c.consul.token())
req.Header.Add("User-Agent", userAgent)
resp, err := client.Do(req)
c.writeBody(dir, "consul-agent-self.json", resp, err)
req, _ = http.NewRequest("GET", consul+"/v1/agent/members", nil)
req.Header.Add("X-Consul-Token", token)
req, _ = http.NewRequest("GET", addr+"/v1/agent/members", nil)
req.Header.Add("X-Consul-Token", c.consul.token())
req.Header.Add("User-Agent", userAgent)
resp, err = client.Do(req)
c.writeBody(dir, "consul-agent-members.json", resp, err)
@ -551,21 +610,16 @@ func (c *DebugCommand) collectConsul(dir, consul string) error {
// collectVault calls the Vault API directly to collect data
func (c *DebugCommand) collectVault(dir, vault string) error {
if vault == "" {
addr := c.vault.addr(vault)
if addr == "" {
return nil
}
token := c.vaultToken
if token == "" {
os.Getenv("VAULT_TOKEN")
}
client := defaultHttpClient()
api.ConfigureTLS(client, c.vault.tls)
client := http.Client{
Timeout: 2 * time.Second,
}
req, _ := http.NewRequest("GET", vault+"/sys/health", nil)
req.Header.Add("X-Vault-Token", token)
req, _ := http.NewRequest("GET", addr+"/sys/health", nil)
req.Header.Add("X-Vault-Token", c.vault.token())
req.Header.Add("User-Agent", userAgent)
resp, err := client.Do(req)
c.writeBody(dir, "vault-sys-health.json", resp, err)
@ -590,28 +644,47 @@ func (c *DebugCommand) writeBytes(dir, file string, data []byte) error {
}
// writeJSON writes JSON responses from the Nomad API calls to the archive
func (c *DebugCommand) writeJSON(dir, file string, data interface{}) error {
func (c *DebugCommand) writeJSON(dir, file string, data interface{}, err error) error {
if err != nil {
return c.writeError(dir, file, err)
}
bytes, err := json.Marshal(data)
if err != nil {
return c.writeError(dir, file, err)
}
return c.writeBytes(dir, file, bytes)
}
// writeError writes a JSON error object to capture errors in the debug bundle without
// reporting
func (c *DebugCommand) writeError(dir, file string, err error) error {
bytes, err := json.Marshal(errorWrapper{Error: err.Error()})
if err != nil {
return err
}
return c.writeBytes(dir, file, bytes)
}
type errorWrapper struct {
Error string
}
// writeBody is a helper that writes the body of an http.Response to the archive
func (c *DebugCommand) writeBody(dir, file string, resp *http.Response, err error) {
if err != nil {
c.writeError(dir, file, err)
return
}
defer resp.Body.Close()
if resp.ContentLength == 0 {
return
}
defer resp.Body.Close()
body, err := ioutil.ReadAll(resp.Body)
if err != nil {
return
c.writeError(dir, file, err)
}
c.writeBytes(dir, file, body)
@ -669,8 +742,8 @@ func (c *DebugCommand) trap() {
}
// TarCZF, like the tar command, recursively builds a gzip compressed tar archive from a
// directory
func TarCZF(archive string, src string) error {
// directory. If not empty, all files in the bundle are prefixed with the target path
func TarCZF(archive string, src, target string) error {
// ensure the src actually exists before trying to tar it
if _, err := os.Stat(src); err != nil {
return fmt.Errorf("Unable to tar files - %v", err.Error())
@ -707,7 +780,13 @@ func TarCZF(archive string, src string) error {
}
// remove leading path to the src, so files are relative to the archive
header.Name = strings.TrimPrefix(strings.Replace(file, src, "", -1), string(filepath.Separator))
path := strings.Replace(file, src, "", -1)
if target != "" {
path = filepath.Join([]string{target, path}...)
}
path = strings.TrimPrefix(path, string(filepath.Separator))
header.Name = path
if err := tw.WriteHeader(header); err != nil {
return err
@ -742,3 +821,63 @@ func argNodes(input string) []string {
}
return out
}
// external holds address configuration for Consul and Vault APIs
type external struct {
tls *api.TLSConfig
addrVal string
auth string
ssl bool
tokenVal string
tokenFile string
}
func (e *external) addr(defaultAddr string) string {
if e.addrVal == "" {
return defaultAddr
}
if !e.ssl {
if strings.HasPrefix(e.addrVal, "http:") {
return e.addrVal
}
return "http://" + e.addrVal
}
if strings.HasPrefix(e.addrVal, "https:") {
return e.addrVal
}
if strings.HasPrefix(e.addrVal, "http:") {
return "https:" + e.addrVal[5:]
}
return "https://" + e.addrVal
}
func (e *external) token() string {
if e.tokenVal != "" {
return e.tokenVal
}
if e.tokenFile != "" {
bs, err := ioutil.ReadFile(e.tokenFile)
if err == nil {
return strings.TrimSpace(string(bs))
}
}
return ""
}
// defaultHttpClient configures a basic httpClient
func defaultHttpClient() *http.Client {
httpClient := cleanhttp.DefaultClient()
transport := httpClient.Transport.(*http.Transport)
transport.TLSHandshakeTimeout = 10 * time.Second
transport.TLSClientConfig = &tls.Config{
MinVersion: tls.VersionTLS12,
}
return httpClient
}

View file

@ -17,6 +17,19 @@ func TestDebugUtils(t *testing.T) {
xs = argNodes("")
require.Len(t, xs, 0)
require.Empty(t, xs)
// address calculation honors CONSUL_HTTP_SSL
e := &external{addrVal: "http://127.0.0.1:8500", ssl: true}
require.Equal(t, "https://127.0.0.1:8500", e.addr("foo"))
e = &external{addrVal: "http://127.0.0.1:8500", ssl: false}
require.Equal(t, "http://127.0.0.1:8500", e.addr("foo"))
e = &external{addrVal: "127.0.0.1:8500", ssl: false}
require.Equal(t, "http://127.0.0.1:8500", e.addr("foo"))
e = &external{addrVal: "127.0.0.1:8500", ssl: true}
require.Equal(t, "https://127.0.0.1:8500", e.addr("foo"))
}
func TestDebugFails(t *testing.T) {
@ -65,20 +78,17 @@ func TestDebugCapturedFiles(t *testing.T) {
ui := new(cli.MockUi)
cmd := &DebugCommand{Meta: Meta{Ui: ui}}
// Calculate the output name
format := "2006-01-02-150405Z"
stamped := "nomad-debug-" + time.Now().UTC().Format(format)
path := filepath.Join(os.TempDir(), stamped)
defer os.Remove(path)
code := cmd.Run([]string{
"-address", url,
"-output", os.TempDir(),
"-server-id", "leader",
"-duration", "1s",
"-interval", "500ms",
"-duration", "1300ms",
"-interval", "600ms",
})
path := cmd.collectDir
defer os.Remove(path)
require.Empty(t, ui.ErrorWriter.String())
require.Equal(t, 0, code)
ui.ErrorWriter.Reset()
@ -86,11 +96,11 @@ func TestDebugCapturedFiles(t *testing.T) {
// Version is always captured
require.FileExists(t, filepath.Join(path, "version", "agent-self.json"))
// Consul and Vault are only captured if they exist
// Consul and Vault contain results or errors
_, err := os.Stat(filepath.Join(path, "version", "consul-agent-self.json"))
require.Error(t, err)
require.NoError(t, err)
_, err = os.Stat(filepath.Join(path, "version", "vault-sys-health.json"))
require.Error(t, err)
require.NoError(t, err)
// Monitor files are only created when selected
require.FileExists(t, filepath.Join(path, "server", "leader", "monitor.log"))
@ -99,10 +109,10 @@ func TestDebugCapturedFiles(t *testing.T) {
require.FileExists(t, filepath.Join(path, "server", "leader", "goroutine.prof"))
// Multiple snapshots are collected, 00 is always created
require.FileExists(t, filepath.Join(path, "nomad", "00", "jobs.json"))
require.FileExists(t, filepath.Join(path, "nomad", "00", "nodes.json"))
require.FileExists(t, filepath.Join(path, "nomad", "0000", "jobs.json"))
require.FileExists(t, filepath.Join(path, "nomad", "0000", "nodes.json"))
// Multiple snapshots are collected, 01 requires two intervals
require.FileExists(t, filepath.Join(path, "nomad", "01", "jobs.json"))
require.FileExists(t, filepath.Join(path, "nomad", "01", "nodes.json"))
require.FileExists(t, filepath.Join(path, "nomad", "0001", "jobs.json"))
require.FileExists(t, filepath.Join(path, "nomad", "0001", "nodes.json"))
}