2020-06-25 16:51:23 +00:00
package command
import (
"archive/tar"
"compress/gzip"
"context"
2020-08-11 17:14:28 +00:00
"crypto/tls"
2020-06-25 16:51:23 +00:00
"encoding/json"
2022-01-18 02:35:51 +00:00
"errors"
2021-11-05 23:43:10 +00:00
"flag"
2020-06-25 16:51:23 +00:00
"fmt"
"html/template"
"io"
"io/ioutil"
"net/http"
"os"
"os/signal"
"path/filepath"
2020-08-11 17:14:28 +00:00
"strconv"
2020-06-25 16:51:23 +00:00
"strings"
"syscall"
"time"
2020-08-11 17:14:28 +00:00
"github.com/hashicorp/go-cleanhttp"
2022-01-18 02:35:51 +00:00
"github.com/hashicorp/go-multierror"
2020-06-25 16:51:23 +00:00
"github.com/hashicorp/nomad/api"
2021-10-13 00:01:54 +00:00
"github.com/hashicorp/nomad/api/contexts"
2020-12-01 17:36:05 +00:00
"github.com/hashicorp/nomad/helper"
"github.com/hashicorp/nomad/nomad/structs"
2021-11-05 23:43:10 +00:00
"github.com/hashicorp/nomad/version"
2020-06-25 16:51:23 +00:00
"github.com/posener/complete"
)
2020-08-11 19:39:44 +00:00
type OperatorDebugCommand struct {
2020-06-25 16:51:23 +00:00
Meta
2020-12-01 17:36:05 +00:00
timestamp string
collectDir string
duration time . Duration
interval time . Duration
pprofDuration time . Duration
logLevel string
maxNodes int
nodeClass string
nodeIDs [ ] string
serverIDs [ ] string
2022-01-18 02:35:51 +00:00
topics map [ api . Topic ] [ ] string
index uint64
2020-12-01 17:36:05 +00:00
consul * external
vault * external
manifest [ ] string
ctx context . Context
cancel context . CancelFunc
2021-12-15 15:44:03 +00:00
opts * api . QueryOptions
2022-01-18 02:35:51 +00:00
verbose bool
2020-06-25 16:51:23 +00:00
}
// Names shared across the debug capture.
const (
	// userAgent is not referenced in this part of the file.
	// NOTE(review): presumably the User-Agent sent on outbound requests — confirm.
	userAgent = "nomad operator debug"

	// clusterDir is the sub-directory for cluster-wide output.
	clusterDir = "cluster"
	// clientDir is the sub-directory for per-client-node output.
	clientDir = "client"
	// serverDir is the sub-directory for per-server output.
	serverDir = "server"
	// intervalDir is not referenced in this part of the file.
	// NOTE(review): presumably the sub-directory for periodic snapshots — confirm.
	intervalDir = "interval"
)
2020-08-11 19:39:44 +00:00
func ( c * OperatorDebugCommand ) Help ( ) string {
2020-06-25 16:51:23 +00:00
helpText := `
2020-08-11 19:39:44 +00:00
Usage : nomad operator debug [ options ]
2020-06-25 16:51:23 +00:00
2021-03-09 13:31:38 +00:00
Build an archive containing Nomad cluster configuration and state , and Consul
and Vault status . Include logs and pprof profiles for selected servers and
client nodes .
2020-06-25 16:51:23 +00:00
2020-11-19 21:38:08 +00:00
If ACLs are enabled , this command will require a token with the ' node : read '
capability to run . In order to collect information , the token will also
require the ' agent : read ' and ' operator : read ' capabilities , as well as the
2020-12-01 17:36:05 +00:00
' list - jobs ' capability for all namespaces . To collect pprof profiles the
2021-03-09 13:31:38 +00:00
token will also require ' agent : write ' , or enable_debug configuration set to
true .
2020-11-19 21:38:08 +00:00
2022-01-18 02:35:51 +00:00
If event stream capture is enabled , the Job , Allocation , Deployment ,
and Evaluation topics require ' namespace : read - job ' capabilities , the Node
topic requires ' node : read ' . A ' management ' token is required to capture
ACLToken , ACLPolicy , or all all events .
2020-06-25 16:51:23 +00:00
General Options :
2020-11-19 16:15:23 +00:00
` + generalOptionsUsage(usageOptsDefault|usageOptsNoNamespace) + `
2020-06-25 16:51:23 +00:00
2021-03-09 13:31:38 +00:00
Consul Options :
2020-06-25 16:51:23 +00:00
2020-08-11 17:14:28 +00:00
- consul - http - addr = < addr >
2021-03-09 13:31:38 +00:00
The address and port of the Consul HTTP agent . Overrides the
CONSUL_HTTP_ADDR environment variable .
2020-08-11 17:14:28 +00:00
- consul - token = < token >
Token used to query Consul . Overrides the CONSUL_HTTP_TOKEN environment
variable and the Consul token file .
- consul - token - file = < path >
Path to the Consul token file . Overrides the CONSUL_HTTP_TOKEN_FILE
environment variable .
- consul - client - cert = < path >
Path to the Consul client cert file . Overrides the CONSUL_CLIENT_CERT
environment variable .
- consul - client - key = < path >
Path to the Consul client key file . Overrides the CONSUL_CLIENT_KEY
environment variable .
- consul - ca - cert = < path >
Path to a CA file to use with Consul . Overrides the CONSUL_CACERT
environment variable and the Consul CA path .
- consul - ca - path = < path >
Path to a directory of PEM encoded CA cert files to verify the Consul
certificate . Overrides the CONSUL_CAPATH environment variable .
2021-03-09 13:31:38 +00:00
Vault Options :
2020-08-11 17:14:28 +00:00
- vault - address = < addr >
The address and port of the Vault HTTP agent . Overrides the VAULT_ADDR
environment variable .
- vault - token = < token >
Token used to query Vault . Overrides the VAULT_TOKEN environment
variable .
- vault - client - cert = < path >
Path to the Vault client cert file . Overrides the VAULT_CLIENT_CERT
environment variable .
- vault - client - key = < path >
Path to the Vault client key file . Overrides the VAULT_CLIENT_KEY
environment variable .
- vault - ca - cert = < path >
Path to a CA file to use with Vault . Overrides the VAULT_CACERT
environment variable and the Vault CA path .
- vault - ca - path = < path >
Path to a directory of PEM encoded CA cert files to verify the Vault
certificate . Overrides the VAULT_CAPATH environment variable .
2021-03-09 13:31:38 +00:00
Debug Options :
- duration = < duration >
2021-10-27 17:43:56 +00:00
Set the duration of the debug capture . Logs will be captured from specified servers and
2022-01-18 02:35:51 +00:00
nodes at "log-level" . Defaults to 2 m .
- event - index = < index >
Specifies the index to start streaming events from . If the requested index is
no longer in the buffer the stream will start at the next available index .
Defaults to 0.
- event - topic = < Allocation , Evaluation , Job , Node , * > : < filter >
Enable event stream capture , filtered by comma delimited list of topic filters .
Examples :
"all" or "*:*" for all events
"Evaluation" or "Evaluation:*" for all evaluation events
"*:example" for all events related to the job "example"
Defaults to "none" ( disabled ) .
2021-03-09 13:31:38 +00:00
- interval = < interval >
2021-12-15 15:44:03 +00:00
The interval between snapshots of the Nomad state . Set interval equal to
2021-03-09 13:31:38 +00:00
duration to capture a single snapshot . Defaults to 30 s .
- log - level = < level >
The log level to monitor . Defaults to DEBUG .
- max - nodes = < count >
Cap the maximum number of client nodes included in the capture . Defaults
to 10 , set to 0 for unlimited .
2021-10-27 17:43:56 +00:00
- node - id = < node1 > , < node2 >
2021-03-09 13:31:38 +00:00
Comma separated list of Nomad client node ids to monitor for logs , API
outputs , and pprof profiles . Accepts id prefixes , and "all" to select all
nodes ( up to count = max - nodes ) . Defaults to "all" .
- node - class = < node - class >
Filter client nodes based on node class .
- pprof - duration = < duration >
Duration for pprof collection . Defaults to 1 s .
2021-10-27 17:43:56 +00:00
- server - id = < server1 > , < server2 >
2021-03-09 13:31:38 +00:00
Comma separated list of Nomad server names to monitor for logs , API
outputs , and pprof profiles . Accepts server names , "leader" , or "all" .
Defaults to "all" .
- stale = < true | false >
If "false" , the default , get membership data from the cluster leader . If
the cluster is in an outage unable to establish leadership , it may be
necessary to get the configuration from a non - leader server .
- output = < path >
2021-12-15 15:44:03 +00:00
Path to the parent directory of the output directory . If specified , no
2022-01-18 02:35:51 +00:00
archive is built . Defaults to the current directory .
- verbose
Enable verbose output .
2020-06-25 16:51:23 +00:00
`
return strings . TrimSpace ( helpText )
}
2020-08-11 19:39:44 +00:00
func ( c * OperatorDebugCommand ) Synopsis ( ) string {
2020-06-25 16:51:23 +00:00
return "Build a debug archive"
}
2020-08-11 19:39:44 +00:00
func ( c * OperatorDebugCommand ) AutocompleteFlags ( ) complete . Flags {
2020-06-25 16:51:23 +00:00
return mergeAutocompleteFlags ( c . Meta . AutocompleteFlags ( FlagSetClient ) ,
complete . Flags {
2020-12-01 17:36:05 +00:00
"-duration" : complete . PredictAnything ,
2022-01-18 02:35:51 +00:00
"-event-index" : complete . PredictAnything ,
"-event-topic" : complete . PredictAnything ,
2020-12-01 17:36:05 +00:00
"-interval" : complete . PredictAnything ,
2021-10-13 00:01:54 +00:00
"-log-level" : complete . PredictSet ( "TRACE" , "DEBUG" , "INFO" , "WARN" , "ERROR" ) ,
2020-12-01 17:36:05 +00:00
"-max-nodes" : complete . PredictAnything ,
2021-10-13 00:01:54 +00:00
"-node-class" : NodeClassPredictor ( c . Client ) ,
"-node-id" : NodePredictor ( c . Client ) ,
"-server-id" : ServerPredictor ( c . Client ) ,
"-output" : complete . PredictDirs ( "*" ) ,
2020-12-01 17:36:05 +00:00
"-pprof-duration" : complete . PredictAnything ,
"-consul-token" : complete . PredictAnything ,
"-vault-token" : complete . PredictAnything ,
2022-01-18 02:35:51 +00:00
"-verbose" : complete . PredictAnything ,
2020-06-25 16:51:23 +00:00
} )
}
2020-08-11 19:39:44 +00:00
func ( c * OperatorDebugCommand ) AutocompleteArgs ( ) complete . Predictor {
2020-06-25 16:51:23 +00:00
return complete . PredictNothing
}
2021-10-13 00:01:54 +00:00
// NodePredictor returns a client node predictor
func NodePredictor ( factory ApiClientFactory ) complete . Predictor {
return complete . PredictFunc ( func ( a complete . Args ) [ ] string {
client , err := factory ( )
if err != nil {
return nil
}
2021-12-15 15:44:03 +00:00
// note we can't use the -stale flag here because we're in the
// predictor, but a stale query should be safe for prediction;
// we also can't use region forwarding because we can't rely
// on the server being up
resp , _ , err := client . Search ( ) . PrefixSearch (
a . Last , contexts . Nodes , & api . QueryOptions { AllowStale : true } )
2021-10-13 00:01:54 +00:00
if err != nil {
return [ ] string { }
}
return resp . Matches [ contexts . Nodes ]
} )
}
// NodeClassPredictor returns a client node class predictor
2022-01-18 02:35:51 +00:00
// TODO dmay: Consider API options for node class filtering
2021-10-13 00:01:54 +00:00
func NodeClassPredictor ( factory ApiClientFactory ) complete . Predictor {
return complete . PredictFunc ( func ( a complete . Args ) [ ] string {
client , err := factory ( )
if err != nil {
return nil
}
2021-12-15 15:44:03 +00:00
// note we can't use the -stale flag here because we're in the
// predictor, but a stale query should be safe for prediction;
// we also can't use region forwarding because we can't rely
// on the server being up
nodes , _ , err := client . Nodes ( ) . List ( & api . QueryOptions { AllowStale : true } )
2021-10-13 00:01:54 +00:00
if err != nil {
return [ ] string { }
}
// Build map of unique node classes across all nodes
classes := make ( map [ string ] bool )
for _ , node := range nodes {
classes [ node . NodeClass ] = true
}
// Iterate over node classes looking for match
filtered := [ ] string { }
for class := range classes {
if strings . HasPrefix ( class , a . Last ) {
filtered = append ( filtered , class )
}
}
return filtered
} )
}
// ServerPredictor returns a server member predictor
2022-01-18 02:35:51 +00:00
// TODO dmay: Consider API options for server member filtering
2021-10-13 00:01:54 +00:00
func ServerPredictor ( factory ApiClientFactory ) complete . Predictor {
return complete . PredictFunc ( func ( a complete . Args ) [ ] string {
client , err := factory ( )
if err != nil {
return nil
}
2021-12-15 15:44:03 +00:00
// note we can't use the -stale flag here because we're in the
// predictor, but a stale query should be safe for prediction;
// we also can't use region forwarding because we can't rely
// on the server being up
members , err := client . Agent ( ) . MembersOpts ( & api . QueryOptions { AllowStale : true } )
2021-10-13 00:01:54 +00:00
if err != nil {
return [ ] string { }
}
// Iterate over server members looking for match
filtered := [ ] string { }
for _ , member := range members . Members {
if strings . HasPrefix ( member . Name , a . Last ) {
filtered = append ( filtered , member . Name )
}
}
return filtered
} )
}
2021-12-15 15:44:03 +00:00
// queryOpts hands out a private copy of the shared api.QueryOptions,
// including a fresh copy of the Params map, so api package methods can
// modify the options without affecting other callers.
func (c *OperatorDebugCommand) queryOpts() *api.QueryOptions {
	dup := *c.opts
	dup.Params = helper.CopyMapStringString(c.opts.Params)
	return &dup
}
2020-08-11 19:39:44 +00:00
// Name returns the name of the command; Run uses it to label the flag set.
func (c *OperatorDebugCommand) Name() string {
	return "debug"
}
2020-06-25 16:51:23 +00:00
2020-08-11 19:39:44 +00:00
func ( c * OperatorDebugCommand ) Run ( args [ ] string ) int {
2020-06-25 16:51:23 +00:00
flags := c . Meta . FlagSet ( c . Name ( ) , FlagSetClient )
flags . Usage = func ( ) { c . Ui . Output ( c . Help ( ) ) }
2022-01-18 02:35:51 +00:00
var duration , interval , output , pprofDuration , eventTopic string
var eventIndex int64
2020-06-25 16:51:23 +00:00
var nodeIDs , serverIDs string
2021-12-15 15:44:03 +00:00
var allowStale bool
2020-06-25 16:51:23 +00:00
flags . StringVar ( & duration , "duration" , "2m" , "" )
2022-01-18 02:35:51 +00:00
flags . Int64Var ( & eventIndex , "event-index" , 0 , "" )
flags . StringVar ( & eventTopic , "event-topic" , "none" , "" )
2021-03-09 13:31:38 +00:00
flags . StringVar ( & interval , "interval" , "30s" , "" )
2020-06-25 16:51:23 +00:00
flags . StringVar ( & c . logLevel , "log-level" , "DEBUG" , "" )
2020-11-12 16:25:28 +00:00
flags . IntVar ( & c . maxNodes , "max-nodes" , 10 , "" )
flags . StringVar ( & c . nodeClass , "node-class" , "" , "" )
2021-10-27 17:43:56 +00:00
flags . StringVar ( & nodeIDs , "node-id" , "all" , "" )
2021-03-09 13:31:38 +00:00
flags . StringVar ( & serverIDs , "server-id" , "all" , "" )
2021-12-15 15:44:03 +00:00
flags . BoolVar ( & allowStale , "stale" , false , "" )
2020-06-25 16:51:23 +00:00
flags . StringVar ( & output , "output" , "" , "" )
2020-12-01 17:36:05 +00:00
flags . StringVar ( & pprofDuration , "pprof-duration" , "1s" , "" )
2022-01-18 02:35:51 +00:00
flags . BoolVar ( & c . verbose , "verbose" , false , "" )
2020-08-11 17:14:28 +00:00
c . consul = & external { tls : & api . TLSConfig { } }
flags . StringVar ( & c . consul . addrVal , "consul-http-addr" , os . Getenv ( "CONSUL_HTTP_ADDR" ) , "" )
ssl := os . Getenv ( "CONSUL_HTTP_SSL" )
c . consul . ssl , _ = strconv . ParseBool ( ssl )
flags . StringVar ( & c . consul . auth , "consul-auth" , os . Getenv ( "CONSUL_HTTP_AUTH" ) , "" )
flags . StringVar ( & c . consul . tokenVal , "consul-token" , os . Getenv ( "CONSUL_HTTP_TOKEN" ) , "" )
flags . StringVar ( & c . consul . tokenFile , "consul-token-file" , os . Getenv ( "CONSUL_HTTP_TOKEN_FILE" ) , "" )
flags . StringVar ( & c . consul . tls . ClientCert , "consul-client-cert" , os . Getenv ( "CONSUL_CLIENT_CERT" ) , "" )
flags . StringVar ( & c . consul . tls . ClientKey , "consul-client-key" , os . Getenv ( "CONSUL_CLIENT_KEY" ) , "" )
flags . StringVar ( & c . consul . tls . CACert , "consul-ca-cert" , os . Getenv ( "CONSUL_CACERT" ) , "" )
flags . StringVar ( & c . consul . tls . CAPath , "consul-ca-path" , os . Getenv ( "CONSUL_CAPATH" ) , "" )
c . vault = & external { tls : & api . TLSConfig { } }
flags . StringVar ( & c . vault . addrVal , "vault-address" , os . Getenv ( "VAULT_ADDR" ) , "" )
flags . StringVar ( & c . vault . tokenVal , "vault-token" , os . Getenv ( "VAULT_TOKEN" ) , "" )
flags . StringVar ( & c . vault . tls . CACert , "vault-ca-cert" , os . Getenv ( "VAULT_CACERT" ) , "" )
flags . StringVar ( & c . vault . tls . CAPath , "vault-ca-path" , os . Getenv ( "VAULT_CAPATH" ) , "" )
flags . StringVar ( & c . vault . tls . ClientCert , "vault-client-cert" , os . Getenv ( "VAULT_CLIENT_CERT" ) , "" )
flags . StringVar ( & c . vault . tls . ClientKey , "vault-client-key" , os . Getenv ( "VAULT_CLIENT_KEY" ) , "" )
2020-06-25 16:51:23 +00:00
if err := flags . Parse ( args ) ; err != nil {
2020-10-06 02:30:01 +00:00
c . Ui . Error ( fmt . Sprintf ( "Error parsing arguments: %q" , err ) )
2020-06-25 16:51:23 +00:00
return 1
}
2020-11-12 16:25:28 +00:00
// Parse the capture duration
2020-06-25 16:51:23 +00:00
d , err := time . ParseDuration ( duration )
if err != nil {
c . Ui . Error ( fmt . Sprintf ( "Error parsing duration: %s: %s" , duration , err . Error ( ) ) )
return 1
}
c . duration = d
2020-11-12 16:25:28 +00:00
// Parse the capture interval
2020-06-25 16:51:23 +00:00
i , err := time . ParseDuration ( interval )
if err != nil {
c . Ui . Error ( fmt . Sprintf ( "Error parsing interval: %s: %s" , interval , err . Error ( ) ) )
return 1
}
c . interval = i
2021-03-09 13:31:38 +00:00
// Validate interval
if i . Seconds ( ) > d . Seconds ( ) {
c . Ui . Error ( fmt . Sprintf ( "Error parsing interval: %s is greater than duration %s" , interval , duration ) )
return 1
}
2020-12-01 17:36:05 +00:00
// Parse the pprof capture duration
pd , err := time . ParseDuration ( pprofDuration )
if err != nil {
c . Ui . Error ( fmt . Sprintf ( "Error parsing pprof duration: %s: %s" , pprofDuration , err . Error ( ) ) )
return 1
}
c . pprofDuration = pd
2022-01-18 02:35:51 +00:00
// Parse event stream topic filter
t , err := topicsFromString ( eventTopic )
if err != nil {
c . Ui . Error ( fmt . Sprintf ( "Error parsing event topics: %v" , err ) )
return 1
}
c . topics = t
// Validate and set initial event stream index
if eventIndex < 0 {
c . Ui . Error ( "Event stream index must be greater than zero" )
return 1
}
c . index = uint64 ( eventIndex )
2020-11-12 16:25:28 +00:00
// Verify there are no extra arguments
2020-06-25 16:51:23 +00:00
args = flags . Args ( )
if l := len ( args ) ; l != 0 {
c . Ui . Error ( "This command takes no arguments" )
c . Ui . Error ( commandErrorText ( c ) )
return 1
}
2020-11-12 16:25:28 +00:00
// Initialize capture variables and structs
c . manifest = make ( [ ] string , 0 )
ctx , cancel := context . WithCancel ( context . Background ( ) )
c . ctx = ctx
c . cancel = cancel
c . trap ( )
// Generate timestamped file name
format := "2006-01-02-150405Z"
c . timestamp = time . Now ( ) . UTC ( ) . Format ( format )
stamped := "nomad-debug-" + c . timestamp
// Create the output directory
var tmp string
if output != "" {
// User specified output directory
tmp = filepath . Join ( output , stamped )
_ , err := os . Stat ( tmp )
if ! os . IsNotExist ( err ) {
c . Ui . Error ( "Output directory already exists" )
return 2
}
} else {
// Generate temp directory
tmp , err = ioutil . TempDir ( os . TempDir ( ) , stamped )
if err != nil {
c . Ui . Error ( fmt . Sprintf ( "Error creating tmp directory: %s" , err . Error ( ) ) )
return 2
}
defer os . RemoveAll ( tmp )
}
c . collectDir = tmp
2021-11-05 23:43:10 +00:00
// Write CLI flags to JSON file
c . writeFlags ( flags )
2020-11-12 16:25:28 +00:00
// Create an instance of the API client
2020-06-25 16:51:23 +00:00
client , err := c . Meta . Client ( )
if err != nil {
c . Ui . Error ( fmt . Sprintf ( "Error initializing client: %s" , err . Error ( ) ) )
return 1
}
2021-12-15 15:44:03 +00:00
c . opts = & api . QueryOptions {
Region : c . Meta . region ,
AllowStale : allowStale ,
AuthToken : c . Meta . token ,
}
2020-11-12 16:25:28 +00:00
// Search all nodes If a node class is specified without a list of node id prefixes
if c . nodeClass != "" && nodeIDs == "" {
nodeIDs = "all"
}
// Resolve client node id prefixes
nodesFound := 0
nodeLookupFailCount := 0
nodeCaptureCount := 0
2021-10-12 20:58:41 +00:00
for _ , id := range stringToSlice ( nodeIDs ) {
2020-11-12 16:25:28 +00:00
if id == "all" {
// Capture from all nodes using empty prefix filter
id = ""
} else {
// Capture from nodes starting with prefix id
id = sanitizeUUIDPrefix ( id )
}
2021-12-15 15:44:03 +00:00
nodes , _ , err := client . Nodes ( ) . PrefixListOpts ( id , c . queryOpts ( ) )
2020-06-25 16:51:23 +00:00
if err != nil {
c . Ui . Error ( fmt . Sprintf ( "Error querying node info: %s" , err ) )
return 1
}
2020-11-12 16:25:28 +00:00
// Increment fail count if no nodes are found
2020-12-14 20:02:48 +00:00
if len ( nodes ) == 0 {
2020-06-25 16:51:23 +00:00
c . Ui . Error ( fmt . Sprintf ( "No node(s) with prefix %q found" , id ) )
2020-11-12 16:25:28 +00:00
nodeLookupFailCount ++
continue
2020-06-25 16:51:23 +00:00
}
2020-12-14 20:02:48 +00:00
nodesFound += len ( nodes )
2020-11-12 16:25:28 +00:00
// Apply constraints to nodes found
2020-06-25 16:51:23 +00:00
for _ , n := range nodes {
2020-11-12 16:25:28 +00:00
// Ignore nodes that do not match specified class
if c . nodeClass != "" && n . NodeClass != c . nodeClass {
continue
}
// Add node to capture list
2020-06-25 16:51:23 +00:00
c . nodeIDs = append ( c . nodeIDs , n . ID )
2020-11-12 16:25:28 +00:00
nodeCaptureCount ++
// Stop looping when we reach the max
if c . maxNodes != 0 && nodeCaptureCount >= c . maxNodes {
break
}
2020-06-25 16:51:23 +00:00
}
}
2020-11-12 16:25:28 +00:00
// Return error if nodes were specified but none were found
if len ( nodeIDs ) > 0 && nodeCaptureCount == 0 {
2021-10-27 17:43:56 +00:00
if nodeIDs == "all" {
// It's okay to have zero clients for default "all"
c . Ui . Info ( "Note: \"-node-id=all\" specified but no clients found" )
} else {
c . Ui . Error ( fmt . Sprintf ( "Failed to retrieve clients, 0 nodes found in list: %s" , nodeIDs ) )
return 1
}
2020-11-12 16:25:28 +00:00
}
2020-10-14 19:16:10 +00:00
// Resolve servers
2021-12-15 15:44:03 +00:00
members , err := client . Agent ( ) . MembersOpts ( c . queryOpts ( ) )
2020-11-12 16:25:28 +00:00
if err != nil {
c . Ui . Error ( fmt . Sprintf ( "Failed to retrieve server list; err: %v" , err ) )
return 1
}
2021-10-12 20:58:41 +00:00
// Write complete list of server members to file
2021-10-13 22:00:55 +00:00
c . writeJSON ( clusterDir , "members.json" , members , err )
2021-10-12 20:58:41 +00:00
// Filter for servers matching criteria
c . serverIDs , err = filterServerMembers ( members , serverIDs , c . region )
if err != nil {
c . Ui . Error ( fmt . Sprintf ( "Failed to parse server list; err: %v" , err ) )
return 1
2020-10-14 19:16:10 +00:00
}
2020-11-12 16:25:28 +00:00
serversFound := 0
serverCaptureCount := 0
if members != nil {
serversFound = len ( members . Members )
}
if c . serverIDs != nil {
serverCaptureCount = len ( c . serverIDs )
}
2020-10-14 19:16:10 +00:00
// Return error if servers were specified but not found
2020-11-12 16:25:28 +00:00
if len ( serverIDs ) > 0 && serverCaptureCount == 0 {
2020-10-14 19:16:10 +00:00
c . Ui . Error ( fmt . Sprintf ( "Failed to retrieve servers, 0 members found in list: %s" , serverIDs ) )
return 1
2020-06-25 16:51:23 +00:00
}
2020-11-12 16:25:28 +00:00
// Display general info about the capture
c . Ui . Output ( "Starting debugger..." )
c . Ui . Output ( "" )
2021-11-05 23:43:10 +00:00
c . Ui . Output ( fmt . Sprintf ( "Nomad CLI Version: %s" , version . GetVersion ( ) . FullVersionNumber ( true ) ) )
2021-10-12 20:58:41 +00:00
c . Ui . Output ( fmt . Sprintf ( " Region: %s" , c . region ) )
c . Ui . Output ( fmt . Sprintf ( " Namespace: %s" , c . namespace ) )
2020-11-12 16:25:28 +00:00
c . Ui . Output ( fmt . Sprintf ( " Servers: (%d/%d) %v" , serverCaptureCount , serversFound , c . serverIDs ) )
c . Ui . Output ( fmt . Sprintf ( " Clients: (%d/%d) %v" , nodeCaptureCount , nodesFound , c . nodeIDs ) )
if nodeCaptureCount > 0 && nodeCaptureCount == c . maxNodes {
c . Ui . Output ( fmt . Sprintf ( " Max node count reached (%d)" , c . maxNodes ) )
2020-06-25 16:51:23 +00:00
}
2020-11-12 16:25:28 +00:00
if nodeLookupFailCount > 0 {
c . Ui . Output ( fmt . Sprintf ( "Client fail count: %v" , nodeLookupFailCount ) )
}
if c . nodeClass != "" {
c . Ui . Output ( fmt . Sprintf ( " Node Class: %s" , c . nodeClass ) )
}
c . Ui . Output ( fmt . Sprintf ( " Interval: %s" , interval ) )
c . Ui . Output ( fmt . Sprintf ( " Duration: %s" , duration ) )
2020-12-01 17:36:05 +00:00
if c . pprofDuration . Seconds ( ) != 1 {
c . Ui . Output ( fmt . Sprintf ( " pprof Duration: %s" , c . pprofDuration ) )
}
2022-01-18 02:35:51 +00:00
if c . topics != nil {
c . Ui . Output ( fmt . Sprintf ( " Event topics: %+v" , c . topics ) )
}
2020-11-12 16:25:28 +00:00
c . Ui . Output ( "" )
c . Ui . Output ( "Capturing cluster data..." )
2020-06-25 16:51:23 +00:00
2020-11-12 16:25:28 +00:00
// Start collecting data
2020-06-25 16:51:23 +00:00
err = c . collect ( client )
if err != nil {
c . Ui . Error ( fmt . Sprintf ( "Error collecting data: %s" , err . Error ( ) ) )
return 2
}
2020-11-12 16:25:28 +00:00
// Write index json/html manifest files
2020-06-25 16:51:23 +00:00
c . writeManifest ( )
2020-11-12 16:25:28 +00:00
// Exit before archive if output directory was specified
2020-06-25 16:51:23 +00:00
if output != "" {
c . Ui . Output ( fmt . Sprintf ( "Created debug directory: %s" , c . collectDir ) )
return 0
}
2020-11-12 16:25:28 +00:00
// Create archive tarball
2020-06-25 16:51:23 +00:00
archiveFile := stamped + ".tar.gz"
2020-08-11 17:14:28 +00:00
err = TarCZF ( archiveFile , tmp , stamped )
2020-06-25 16:51:23 +00:00
if err != nil {
c . Ui . Error ( fmt . Sprintf ( "Error creating archive: %s" , err . Error ( ) ) )
return 2
}
2020-11-12 16:25:28 +00:00
// Final output with name of tarball
2020-06-25 16:51:23 +00:00
c . Ui . Output ( fmt . Sprintf ( "Created debug archive: %s" , archiveFile ) )
return 0
}
// collect collects data from our endpoints and writes the archive bundle
2020-08-11 19:39:44 +00:00
func ( c * OperatorDebugCommand ) collect ( client * api . Client ) error {
2022-01-18 02:35:51 +00:00
// Start background captures
c . startMonitors ( client )
c . startEventStream ( client )
2020-06-25 16:51:23 +00:00
2022-01-18 02:35:51 +00:00
// Collect cluster data
2020-06-25 16:51:23 +00:00
self , err := client . Agent ( ) . Self ( )
2021-10-13 22:00:55 +00:00
c . writeJSON ( clusterDir , "agent-self.json" , self , err )
2020-06-25 16:51:23 +00:00
2021-12-15 15:44:03 +00:00
namespaces , _ , err := client . Namespaces ( ) . List ( c . queryOpts ( ) )
2021-10-13 22:00:55 +00:00
c . writeJSON ( clusterDir , "namespaces.json" , namespaces , err )
2021-10-12 20:58:41 +00:00
regions , err := client . Regions ( ) . List ( )
2021-10-13 22:00:55 +00:00
c . writeJSON ( clusterDir , "regions.json" , regions , err )
2021-10-12 20:58:41 +00:00
2021-11-05 23:43:10 +00:00
// Collect data from Consul
if c . consul . addrVal == "" {
c . getConsulAddrFromSelf ( self )
}
c . collectConsul ( clusterDir )
2020-06-25 16:51:23 +00:00
2021-11-05 23:43:10 +00:00
// Collect data from Vault
vaultAddr := c . vault . addrVal
if vaultAddr == "" {
vaultAddr = c . getVaultAddrFromSelf ( self )
2020-06-25 16:51:23 +00:00
}
2021-11-05 23:43:10 +00:00
c . collectVault ( clusterDir , vaultAddr )
2020-06-25 16:51:23 +00:00
2020-08-11 17:14:28 +00:00
c . collectAgentHosts ( client )
c . collectPprofs ( client )
2020-06-25 16:51:23 +00:00
c . collectPeriodic ( client )
return nil
}
// path returns platform specific paths in the tmp root directory
2020-08-11 19:39:44 +00:00
func ( c * OperatorDebugCommand ) path ( paths ... string ) string {
2020-06-25 16:51:23 +00:00
ps := [ ] string { c . collectDir }
ps = append ( ps , paths ... )
return filepath . Join ( ps ... )
}
// mkdir creates directories in the tmp root directory
2020-08-11 19:39:44 +00:00
func ( c * OperatorDebugCommand ) mkdir ( paths ... string ) error {
2020-12-01 17:36:05 +00:00
joinedPath := c . path ( paths ... )
// Ensure path doesn't escape the sandbox of the capture directory
escapes := helper . PathEscapesSandbox ( c . collectDir , joinedPath )
if escapes {
return fmt . Errorf ( "file path escapes capture directory" )
}
return os . MkdirAll ( joinedPath , 0755 )
2020-06-25 16:51:23 +00:00
}
// startMonitors starts go routines for each node and client
2020-08-11 19:39:44 +00:00
func ( c * OperatorDebugCommand ) startMonitors ( client * api . Client ) {
2020-06-25 16:51:23 +00:00
for _ , id := range c . nodeIDs {
2021-10-13 22:00:55 +00:00
go c . startMonitor ( clientDir , "node_id" , id , client )
2020-06-25 16:51:23 +00:00
}
for _ , id := range c . serverIDs {
2021-10-13 22:00:55 +00:00
go c . startMonitor ( serverDir , "server_id" , id , client )
2020-06-25 16:51:23 +00:00
}
}
// startMonitor starts one monitor api request, writing to a file. It blocks and should be
// called in a go routine. Errors are ignored, we want to build the archive even if a node
// is unavailable
2020-08-11 19:39:44 +00:00
func ( c * OperatorDebugCommand ) startMonitor ( path , idKey , nodeID string , client * api . Client ) {
2020-06-25 16:51:23 +00:00
c . mkdir ( path , nodeID )
fh , err := os . Create ( c . path ( path , nodeID , "monitor.log" ) )
if err != nil {
return
}
defer fh . Close ( )
qo := api . QueryOptions {
Params : map [ string ] string {
idKey : nodeID ,
"log_level" : c . logLevel ,
} ,
2021-12-15 15:44:03 +00:00
AllowStale : c . queryOpts ( ) . AllowStale ,
2020-06-25 16:51:23 +00:00
}
outCh , errCh := client . Agent ( ) . Monitor ( c . ctx . Done ( ) , & qo )
for {
select {
case out := <- outCh :
if out == nil {
continue
}
fh . Write ( out . Data )
case err := <- errCh :
fh . WriteString ( fmt . Sprintf ( "monitor: %s\n" , err . Error ( ) ) )
return
case <- c . ctx . Done ( ) :
return
}
}
}
2022-01-18 02:35:51 +00:00
// startEventStream runs captureEventStream in a background goroutine and
// reports any error it returns through the UI. A multierror is rendered with
// the multierror list formatter; any other error is printed as-is.
func (c *OperatorDebugCommand) startEventStream(client *api.Client) {
	c.verboseOut("Launching eventstream goroutine...")

	go func() {
		err := c.captureEventStream(client)
		if err == nil {
			return
		}

		msg := err.Error()
		if mErr, ok := err.(*multierror.Error); ok {
			msg = multierror.ListFormatFunc(mErr.Errors)
		}
		c.Ui.Error(fmt.Sprintf("Error capturing event stream: %s", msg))
	}()
}
// captureEventStream streams events for the configured topics, starting at
// the configured index, and appends each event as one JSON document per line
// to cluster/eventstream.json inside the capture directory.
//
// It blocks until the capture context is cancelled or the stream's channel is
// closed. Errors hit while streaming are accumulated and returned together as
// a multierror; failing to create the output or to start the stream is
// returned immediately. A stream cancelled before it starts is not an error.
func (c *OperatorDebugCommand) captureEventStream(client *api.Client) error {
	// Ensure output directory is present. mkdir already resolves its
	// arguments against the capture directory, so pass the relative name.
	path := clusterDir
	if err := c.mkdir(path); err != nil {
		return err
	}

	// Create the output file
	fh, err := os.Create(c.path(path, "eventstream.json"))
	if err != nil {
		return err
	}
	defer fh.Close()

	// Get handle to events endpoint
	events := client.EventStream()

	// Start streaming events
	eventCh, err := events.Stream(c.ctx, c.topics, c.index, c.queryOpts())
	if err != nil {
		if errors.Is(err, context.Canceled) {
			c.verboseOut("Event stream canceled: No events captured")
			return nil
		}
		return fmt.Errorf("failed to stream events: %w", err)
	}

	eventCount := 0
	errCount := 0
	heartbeatCount := 0
	channelEventCount := 0

	var mErrs *multierror.Error

	// finish prints the capture summary and returns the accumulated errors
	finish := func() error {
		c.verboseOutf("Event stream captured %d events, %d frames, %d heartbeats, %d errors", eventCount, channelEventCount, heartbeatCount, errCount)
		return mErrs.ErrorOrNil()
	}

	for {
		select {
		case event, ok := <-eventCh:
			if !ok {
				// The stream closed its channel; no more frames will arrive
				return finish()
			}
			channelEventCount++

			if event.Err != nil {
				errCount++
				c.verboseOutf("error from event stream: index: %d, err: %v", event.Index, event.Err)
				mErrs = multierror.Append(mErrs, fmt.Errorf("error at index: %d, Err: %w", event.Index, event.Err))
				// Record the error and keep listening for further frames
				continue
			}

			if event.IsHeartbeat() {
				heartbeatCount++
				continue
			}

			for _, e := range event.Events {
				eventCount++
				c.verboseOutf("Event: %4d, Index: %d, Topic: %-10s, Type: %s, FilterKeys: %s", eventCount, e.Index, e.Topic, e.Type, e.FilterKeys)

				encoded, err := json.Marshal(e)
				if err != nil {
					errCount++
					mErrs = multierror.Append(mErrs, fmt.Errorf("failed to marshal json from Topic: %s, Type: %s, Err: %w", e.Topic, e.Type, err))
					// Nothing to write for this event; move on to the next
					continue
				}

				n, err := fh.Write(encoded)
				if err != nil {
					errCount++
					mErrs = multierror.Append(mErrs, fmt.Errorf("failed to write bytes to eventstream.json; bytes written: %d, Err: %w", n, err))
					// Skip the remaining events in this frame
					break
				}

				n, err = fh.WriteString("\n")
				if err != nil {
					errCount++
					mErrs = multierror.Append(mErrs, fmt.Errorf("failed to write string to eventstream.json; chars written: %d, Err: %w", n, err))
				}
			}

		case <-c.ctx.Done():
			return finish()
		}
	}
}
2020-07-02 13:51:25 +00:00
// collectAgentHosts calls collectAgentHost for each selected node
2020-08-11 19:39:44 +00:00
func ( c * OperatorDebugCommand ) collectAgentHosts ( client * api . Client ) {
2020-07-02 13:51:25 +00:00
for _ , n := range c . nodeIDs {
2021-10-13 22:00:55 +00:00
c . collectAgentHost ( clientDir , n , client )
2020-07-02 13:51:25 +00:00
}
for _ , n := range c . serverIDs {
2021-10-13 22:00:55 +00:00
c . collectAgentHost ( serverDir , n , client )
2020-07-02 13:51:25 +00:00
}
}
// collectAgentHost gets the agent host data
2020-08-11 19:39:44 +00:00
func ( c * OperatorDebugCommand ) collectAgentHost ( path , id string , client * api . Client ) {
2020-07-02 13:51:25 +00:00
var host * api . HostDataResponse
var err error
2021-10-13 22:00:55 +00:00
if path == serverDir {
2021-12-15 15:44:03 +00:00
host , err = client . Agent ( ) . Host ( id , "" , c . queryOpts ( ) )
2020-07-02 13:51:25 +00:00
} else {
2021-12-15 15:44:03 +00:00
host , err = client . Agent ( ) . Host ( "" , id , c . queryOpts ( ) )
2020-07-02 13:51:25 +00:00
}
2022-01-17 16:15:17 +00:00
if isRedirectError ( err ) {
c . Ui . Warn ( fmt . Sprintf ( "%s/%s: /v1/agent/host unavailable on this agent" , path , id ) )
return
}
2020-12-01 17:36:05 +00:00
if err != nil {
c . Ui . Error ( fmt . Sprintf ( "%s/%s: Failed to retrieve agent host data, err: %v" , path , id , err ) )
2020-07-02 13:51:25 +00:00
2020-12-01 17:36:05 +00:00
if strings . Contains ( err . Error ( ) , structs . ErrPermissionDenied . Error ( ) ) {
// Drop a hint to help the operator resolve the error
2020-12-09 19:05:18 +00:00
c . Ui . Warn ( "Agent host retrieval requires agent:read ACL or enable_debug=true. See https://www.nomadproject.io/api-docs/agent#host for more information." )
2020-12-01 17:36:05 +00:00
}
return // exit on any error
}
path = filepath . Join ( path , id )
2020-08-11 17:14:28 +00:00
c . writeJSON ( path , "agent-host.json" , host , err )
2020-07-02 13:51:25 +00:00
}
2020-06-25 16:51:23 +00:00
// collectPprofs captures the /agent/pprof for each listed node
2020-08-11 19:39:44 +00:00
func ( c * OperatorDebugCommand ) collectPprofs ( client * api . Client ) {
2020-06-25 16:51:23 +00:00
for _ , n := range c . nodeIDs {
2021-10-13 22:00:55 +00:00
c . collectPprof ( clientDir , n , client )
2020-06-25 16:51:23 +00:00
}
for _ , n := range c . serverIDs {
2021-10-13 22:00:55 +00:00
c . collectPprof ( serverDir , n , client )
2020-06-25 16:51:23 +00:00
}
}
// collectPprof captures pprof data for the node
2020-08-11 19:39:44 +00:00
func ( c * OperatorDebugCommand ) collectPprof ( path , id string , client * api . Client ) {
2020-12-01 17:36:05 +00:00
pprofDurationSeconds := int ( c . pprofDuration . Seconds ( ) )
opts := api . PprofOptions { Seconds : pprofDurationSeconds }
2021-10-13 22:00:55 +00:00
if path == serverDir {
2020-06-25 16:51:23 +00:00
opts . ServerID = id
} else {
opts . NodeID = id
}
path = filepath . Join ( path , id )
2021-12-15 15:44:03 +00:00
bs , err := client . Agent ( ) . CPUProfile ( opts , c . queryOpts ( ) )
2020-12-01 17:36:05 +00:00
if err != nil {
c . Ui . Error ( fmt . Sprintf ( "%s: Failed to retrieve pprof profile.prof, err: %v" , path , err ) )
if structs . IsErrPermissionDenied ( err ) {
// All Profiles require the same permissions, so we only need to see
// one permission failure before we bail.
// But lets first drop a hint to help the operator resolve the error
2020-12-09 19:05:18 +00:00
c . Ui . Warn ( "Pprof retrieval requires agent:write ACL or enable_debug=true. See https://www.nomadproject.io/api-docs/agent#agent-runtime-profiles for more information." )
2020-12-01 17:36:05 +00:00
return // only exit on 403
}
} else {
2020-12-08 20:47:04 +00:00
err := c . writeBytes ( path , "profile.prof" , bs )
if err != nil {
2020-12-01 17:36:05 +00:00
c . Ui . Error ( err . Error ( ) )
}
2020-06-25 16:51:23 +00:00
}
2021-06-21 18:22:49 +00:00
// goroutine debug type 1 = legacy text format for human readable output
opts . Debug = 1
c . savePprofProfile ( path , "goroutine" , opts , client )
// goroutine debug type 2 = goroutine stacks in panic format
opts . Debug = 2
c . savePprofProfile ( path , "goroutine" , opts , client )
// Reset to pprof binary format
opts . Debug = 0
c . savePprofProfile ( path , "goroutine" , opts , client ) // Stack traces of all current goroutines
c . savePprofProfile ( path , "trace" , opts , client ) // A trace of execution of the current program
c . savePprofProfile ( path , "heap" , opts , client ) // A sampling of memory allocations of live objects. You can specify the gc GET parameter to run GC before taking the heap sample.
c . savePprofProfile ( path , "allocs" , opts , client ) // A sampling of all past memory allocations
c . savePprofProfile ( path , "threadcreate" , opts , client ) // Stack traces that led to the creation of new OS threads
// This profile is disabled by default -- Requires runtime.SetBlockProfileRate to enable
// c.savePprofProfile(path, "block", opts, client) // Stack traces that led to blocking on synchronization primitives
// This profile is disabled by default -- Requires runtime.SetMutexProfileFraction to enable
// c.savePprofProfile(path, "mutex", opts, client) // Stack traces of holders of contended mutexes
}
// savePprofProfile retrieves a pprof profile and writes to disk
func ( c * OperatorDebugCommand ) savePprofProfile ( path string , profile string , opts api . PprofOptions , client * api . Client ) {
fileName := fmt . Sprintf ( "%s.prof" , profile )
if opts . Debug > 0 {
fileName = fmt . Sprintf ( "%s-debug%d.txt" , profile , opts . Debug )
2020-06-25 16:51:23 +00:00
}
2021-12-15 15:44:03 +00:00
bs , err := retrievePprofProfile ( profile , opts , client , c . queryOpts ( ) )
2020-12-01 17:36:05 +00:00
if err != nil {
2021-06-21 18:22:49 +00:00
c . Ui . Error ( fmt . Sprintf ( "%s: Failed to retrieve pprof %s, err: %s" , path , fileName , err . Error ( ) ) )
2020-06-25 16:51:23 +00:00
}
2020-10-14 19:16:10 +00:00
2021-06-21 18:22:49 +00:00
err = c . writeBytes ( path , fileName , bs )
2020-12-01 17:36:05 +00:00
if err != nil {
2021-06-21 18:22:49 +00:00
c . Ui . Error ( fmt . Sprintf ( "%s: Failed to write file %s, err: %s" , path , fileName , err . Error ( ) ) )
2020-10-14 19:16:10 +00:00
}
2021-06-21 18:22:49 +00:00
}
2020-10-14 19:16:10 +00:00
2021-12-15 15:44:03 +00:00
// retrievePprofProfile gets a pprof profile from the node specified
// in opts using the API client
func retrievePprofProfile ( profile string , opts api . PprofOptions , client * api . Client , qopts * api . QueryOptions ) ( bs [ ] byte , err error ) {
2021-06-21 18:22:49 +00:00
switch profile {
case "cpuprofile" :
2021-12-15 15:44:03 +00:00
bs , err = client . Agent ( ) . CPUProfile ( opts , qopts )
2021-06-21 18:22:49 +00:00
case "trace" :
2021-12-15 15:44:03 +00:00
bs , err = client . Agent ( ) . Trace ( opts , qopts )
2021-06-21 18:22:49 +00:00
default :
2021-12-15 15:44:03 +00:00
bs , err = client . Agent ( ) . Lookup ( profile , opts , qopts )
2020-10-14 19:16:10 +00:00
}
2021-06-21 18:22:49 +00:00
return bs , err
2020-06-25 16:51:23 +00:00
}
2021-12-15 15:44:03 +00:00
// collectPeriodic runs for duration, capturing the cluster state
// every interval. It flushes and stops the monitor requests
2020-08-11 19:39:44 +00:00
func ( c * OperatorDebugCommand ) collectPeriodic ( client * api . Client ) {
2020-06-25 16:51:23 +00:00
duration := time . After ( c . duration )
// Set interval to 0 so that we immediately execute, wait the interval next time
interval := time . After ( 0 * time . Second )
var intervalCount int
2020-08-11 17:14:28 +00:00
var name , dir string
2020-06-25 16:51:23 +00:00
for {
select {
case <- duration :
c . cancel ( )
return
case <- interval :
2020-08-11 17:14:28 +00:00
name = fmt . Sprintf ( "%04d" , intervalCount )
2021-10-13 22:00:55 +00:00
dir = filepath . Join ( intervalDir , name )
2020-08-11 17:14:28 +00:00
c . Ui . Output ( fmt . Sprintf ( " Capture interval %s" , name ) )
2020-06-25 16:51:23 +00:00
c . collectNomad ( dir , client )
2020-08-11 17:14:28 +00:00
c . collectOperator ( dir , client )
2020-06-25 16:51:23 +00:00
interval = time . After ( c . interval )
2020-12-01 17:36:05 +00:00
intervalCount ++
2020-06-25 16:51:23 +00:00
case <- c . ctx . Done ( ) :
return
}
}
}
2020-08-11 17:14:28 +00:00
// collectOperator captures some cluster meta information
2020-08-11 19:39:44 +00:00
func ( c * OperatorDebugCommand ) collectOperator ( dir string , client * api . Client ) {
2021-12-15 15:44:03 +00:00
rc , err := client . Operator ( ) . RaftGetConfiguration ( c . queryOpts ( ) )
2020-08-11 17:14:28 +00:00
c . writeJSON ( dir , "operator-raft.json" , rc , err )
2021-12-15 15:44:03 +00:00
sc , _ , err := client . Operator ( ) . SchedulerGetConfiguration ( c . queryOpts ( ) )
2020-08-11 17:14:28 +00:00
c . writeJSON ( dir , "operator-scheduler.json" , sc , err )
2021-12-15 15:44:03 +00:00
ah , _ , err := client . Operator ( ) . AutopilotServerHealth ( c . queryOpts ( ) )
2020-08-11 17:14:28 +00:00
c . writeJSON ( dir , "operator-autopilot-health.json" , ah , err )
2020-08-31 17:22:23 +00:00
2021-12-15 15:44:03 +00:00
lic , _ , err := client . Operator ( ) . LicenseGet ( c . queryOpts ( ) )
2020-08-31 17:22:23 +00:00
c . writeJSON ( dir , "license.json" , lic , err )
2020-08-11 17:14:28 +00:00
}
2020-06-25 16:51:23 +00:00
// collectNomad captures the nomad cluster state
2020-08-11 19:39:44 +00:00
func ( c * OperatorDebugCommand ) collectNomad ( dir string , client * api . Client ) error {
2020-06-25 16:51:23 +00:00
2021-12-15 15:44:03 +00:00
js , _ , err := client . Jobs ( ) . List ( c . queryOpts ( ) )
2020-08-11 17:14:28 +00:00
c . writeJSON ( dir , "jobs.json" , js , err )
2020-06-25 16:51:23 +00:00
2021-12-15 15:44:03 +00:00
ds , _ , err := client . Deployments ( ) . List ( c . queryOpts ( ) )
2020-08-11 17:14:28 +00:00
c . writeJSON ( dir , "deployments.json" , ds , err )
2020-06-25 16:51:23 +00:00
2021-12-15 15:44:03 +00:00
es , _ , err := client . Evaluations ( ) . List ( c . queryOpts ( ) )
2020-08-11 17:14:28 +00:00
c . writeJSON ( dir , "evaluations.json" , es , err )
2020-06-25 16:51:23 +00:00
2021-12-15 15:44:03 +00:00
as , _ , err := client . Allocations ( ) . List ( c . queryOpts ( ) )
2020-08-11 17:14:28 +00:00
c . writeJSON ( dir , "allocations.json" , as , err )
2020-06-25 16:51:23 +00:00
2021-12-15 15:44:03 +00:00
ns , _ , err := client . Nodes ( ) . List ( c . queryOpts ( ) )
2020-08-11 17:14:28 +00:00
c . writeJSON ( dir , "nodes.json" , ns , err )
2020-06-25 16:51:23 +00:00
2020-12-01 17:36:05 +00:00
// CSI Plugins - /v1/plugins?type=csi
2021-12-15 15:44:03 +00:00
ps , _ , err := client . CSIPlugins ( ) . List ( c . queryOpts ( ) )
2021-10-13 22:00:55 +00:00
c . writeJSON ( dir , "csi-plugins.json" , ps , err )
2020-06-25 16:51:23 +00:00
2020-12-01 17:36:05 +00:00
// CSI Plugin details - /v1/plugin/csi/:plugin_id
for _ , p := range ps {
2021-12-15 15:44:03 +00:00
csiPlugin , _ , err := client . CSIPlugins ( ) . Info ( p . ID , c . queryOpts ( ) )
2020-12-01 17:36:05 +00:00
csiPluginFileName := fmt . Sprintf ( "csi-plugin-id-%s.json" , p . ID )
c . writeJSON ( dir , csiPluginFileName , csiPlugin , err )
}
2020-06-25 16:51:23 +00:00
2020-12-01 17:36:05 +00:00
// CSI Volumes - /v1/volumes?type=csi
2021-12-15 15:44:03 +00:00
csiVolumes , _ , err := client . CSIVolumes ( ) . List ( c . queryOpts ( ) )
2020-12-01 17:36:05 +00:00
c . writeJSON ( dir , "csi-volumes.json" , csiVolumes , err )
// CSI Volume details - /v1/volumes/csi/:volume-id
for _ , v := range csiVolumes {
2021-12-15 15:44:03 +00:00
csiVolume , _ , err := client . CSIVolumes ( ) . Info ( v . ID , c . queryOpts ( ) )
2020-12-01 17:36:05 +00:00
csiFileName := fmt . Sprintf ( "csi-volume-id-%s.json" , v . ID )
c . writeJSON ( dir , csiFileName , csiVolume , err )
2020-10-14 19:16:10 +00:00
}
2020-10-06 02:30:01 +00:00
2021-12-15 15:44:03 +00:00
metrics , _ , err := client . Operator ( ) . MetricsSummary ( c . queryOpts ( ) )
2020-12-01 17:36:05 +00:00
c . writeJSON ( dir , "metrics.json" , metrics , err )
2020-06-25 16:51:23 +00:00
return nil
}
2021-11-05 23:43:10 +00:00
// collectConsul calls the Consul API to collect data
func ( c * OperatorDebugCommand ) collectConsul ( dir string ) {
if c . consul . addrVal == "" {
c . Ui . Output ( "Consul - Skipping, no API address found" )
return
2020-06-25 16:51:23 +00:00
}
2021-11-05 23:43:10 +00:00
c . Ui . Info ( fmt . Sprintf ( "Consul - Collecting Consul API data from: %s" , c . consul . addrVal ) )
client , err := c . consulAPIClient ( )
if err != nil {
c . Ui . Error ( fmt . Sprintf ( "failed to create Consul API client: %s" , err ) )
return
}
// Exit if we are unable to retrieve the leader
err = c . collectConsulAPIRequest ( client , "/v1/status/leader" , dir , "consul-leader.json" )
if err != nil {
c . Ui . Output ( fmt . Sprintf ( "Unable to contact Consul leader, skipping: %s" , err ) )
return
}
c . collectConsulAPI ( client , "/v1/agent/host" , dir , "consul-agent-host.json" )
c . collectConsulAPI ( client , "/v1/agent/members" , dir , "consul-agent-members.json" )
c . collectConsulAPI ( client , "/v1/agent/metrics" , dir , "consul-agent-metrics.json" )
c . collectConsulAPI ( client , "/v1/agent/self" , dir , "consul-agent-self.json" )
}
// consulAPIClient builds the default HTTP client and applies the Consul TLS
// settings to it.
func (c *OperatorDebugCommand) consulAPIClient() (*http.Client, error) {
	client := defaultHttpClient()

	if err := api.ConfigureTLS(client, c.consul.tls); err != nil {
		return nil, fmt.Errorf("failed to configure TLS: %w", err)
	}

	return client, nil
}
// collectConsulAPI runs a single Consul API request through
// collectConsulAPIRequest and reports a failure on the UI instead of
// returning it.
func (c *OperatorDebugCommand) collectConsulAPI(client *http.Client, urlPath string, dir string, file string) {
	if err := c.collectConsulAPIRequest(client, urlPath, dir, file); err != nil {
		c.Ui.Error(fmt.Sprintf("Error collecting from Consul API: %s", err.Error()))
	}
}
func ( c * OperatorDebugCommand ) collectConsulAPIRequest ( client * http . Client , urlPath string , dir string , file string ) error {
url := c . consul . addrVal + urlPath
req , err := http . NewRequest ( "GET" , url , nil )
if err != nil {
return fmt . Errorf ( "failed to create HTTP request for Consul API URL=%q: %w" , url , err )
}
2020-06-25 16:51:23 +00:00
2020-08-11 17:14:28 +00:00
req . Header . Add ( "X-Consul-Token" , c . consul . token ( ) )
2020-06-25 16:51:23 +00:00
req . Header . Add ( "User-Agent" , userAgent )
2021-11-05 23:43:10 +00:00
2020-06-25 16:51:23 +00:00
resp , err := client . Do ( req )
2021-11-05 23:43:10 +00:00
if err != nil {
return err
}
2020-06-25 16:51:23 +00:00
2021-11-05 23:43:10 +00:00
c . writeBody ( dir , file , resp , err )
2020-06-25 16:51:23 +00:00
return nil
}
// collectVault calls the Vault API directly to collect data
2020-08-11 19:39:44 +00:00
func ( c * OperatorDebugCommand ) collectVault ( dir , vault string ) error {
2021-11-05 23:43:10 +00:00
vaultAddr := c . vault . addr ( vault )
if vaultAddr == "" {
2020-06-25 16:51:23 +00:00
return nil
}
2021-11-05 23:43:10 +00:00
c . Ui . Info ( fmt . Sprintf ( "Vault - Collecting Vault API data from: %s" , vaultAddr ) )
2020-08-11 17:14:28 +00:00
client := defaultHttpClient ( )
2021-11-05 23:43:10 +00:00
if c . vault . ssl {
err := api . ConfigureTLS ( client , c . vault . tls )
if err != nil {
return fmt . Errorf ( "failed to configure TLS: %w" , err )
}
}
req , err := http . NewRequest ( "GET" , vaultAddr + "/v1/sys/health" , nil )
if err != nil {
return fmt . Errorf ( "failed to create HTTP request for Vault API URL=%q: %w" , vaultAddr , err )
}
2020-06-25 16:51:23 +00:00
2020-08-11 17:14:28 +00:00
req . Header . Add ( "X-Vault-Token" , c . vault . token ( ) )
2020-06-25 16:51:23 +00:00
req . Header . Add ( "User-Agent" , userAgent )
resp , err := client . Do ( req )
c . writeBody ( dir , "vault-sys-health.json" , resp , err )
return nil
}
// writeBytes writes a file to the archive, recording it in the manifest
2020-08-11 19:39:44 +00:00
func ( c * OperatorDebugCommand ) writeBytes ( dir , file string , data [ ] byte ) error {
2020-12-01 17:36:05 +00:00
// Replace invalid characters in filename
filename := helper . CleanFilename ( file , "_" )
relativePath := filepath . Join ( dir , filename )
2020-10-14 19:16:10 +00:00
c . manifest = append ( c . manifest , relativePath )
dirPath := filepath . Join ( c . collectDir , dir )
2020-12-01 17:36:05 +00:00
filePath := filepath . Join ( dirPath , filename )
2020-10-14 19:16:10 +00:00
// Ensure parent directories exist
err := os . MkdirAll ( dirPath , os . ModePerm )
if err != nil {
2020-12-01 17:36:05 +00:00
return fmt . Errorf ( "failed to create parent directories of \"%s\": %w" , dirPath , err )
}
// Ensure filename doesn't escape the sandbox of the capture directory
escapes := helper . PathEscapesSandbox ( c . collectDir , filePath )
if escapes {
return fmt . Errorf ( "file path \"%s\" escapes capture directory \"%s\"" , filePath , c . collectDir )
2020-10-14 19:16:10 +00:00
}
2020-06-25 16:51:23 +00:00
2020-10-14 19:16:10 +00:00
// Create the file
fh , err := os . Create ( filePath )
2020-06-25 16:51:23 +00:00
if err != nil {
2020-12-01 17:36:05 +00:00
return fmt . Errorf ( "failed to create file \"%s\", err: %w" , filePath , err )
2020-06-25 16:51:23 +00:00
}
defer fh . Close ( )
_ , err = fh . Write ( data )
2020-12-01 17:36:05 +00:00
if err != nil {
return fmt . Errorf ( "Failed to write data to file \"%s\", err: %w" , filePath , err )
}
return nil
2020-06-25 16:51:23 +00:00
}
// writeJSON writes JSON responses from the Nomad API calls to the archive
2020-08-11 19:39:44 +00:00
func ( c * OperatorDebugCommand ) writeJSON ( dir , file string , data interface { } , err error ) error {
2020-08-11 17:14:28 +00:00
if err != nil {
return c . writeError ( dir , file , err )
}
2020-06-25 16:51:23 +00:00
bytes , err := json . Marshal ( data )
2020-08-11 17:14:28 +00:00
if err != nil {
return c . writeError ( dir , file , err )
}
2020-12-01 17:36:05 +00:00
err = c . writeBytes ( dir , file , bytes )
if err != nil {
c . Ui . Error ( err . Error ( ) )
}
return nil
2020-08-11 17:14:28 +00:00
}
// writeError writes a JSON error object to capture errors in the debug bundle without
// reporting
2020-08-11 19:39:44 +00:00
func ( c * OperatorDebugCommand ) writeError ( dir , file string , err error ) error {
2020-08-11 17:14:28 +00:00
bytes , err := json . Marshal ( errorWrapper { Error : err . Error ( ) } )
2020-06-25 16:51:23 +00:00
if err != nil {
return err
}
return c . writeBytes ( dir , file , bytes )
}
2020-08-11 17:14:28 +00:00
// errorWrapper is the JSON object that writeError stores in the bundle in
// place of a response that could not be collected.
type errorWrapper struct {
// Error holds the message of the captured error
Error string
}
2020-06-25 16:51:23 +00:00
// writeBody is a helper that writes the body of an http.Response to the archive
2020-08-11 19:39:44 +00:00
func ( c * OperatorDebugCommand ) writeBody ( dir , file string , resp * http . Response , err error ) {
2020-06-25 16:51:23 +00:00
if err != nil {
2020-08-11 17:14:28 +00:00
c . writeError ( dir , file , err )
2020-06-25 16:51:23 +00:00
return
}
if resp . ContentLength == 0 {
return
}
2020-08-11 17:14:28 +00:00
defer resp . Body . Close ( )
2020-06-25 16:51:23 +00:00
body , err := ioutil . ReadAll ( resp . Body )
if err != nil {
2020-08-11 17:14:28 +00:00
c . writeError ( dir , file , err )
2020-12-01 17:36:05 +00:00
return
2020-06-25 16:51:23 +00:00
}
2020-12-01 17:36:05 +00:00
if err := c . writeBytes ( dir , file , body ) ; err != nil {
c . Ui . Error ( err . Error ( ) )
}
2020-06-25 16:51:23 +00:00
}
2021-11-05 23:43:10 +00:00
// flagExport is the JSON document that writeFlags stores as cli-flags.json to
// record how the command was invoked.
type flagExport struct {
// Name is the name of the flag set
Name string
// Parsed is the flag set's Parsed() result
Parsed bool
// Actual holds the flags visited by FlagSet.Visit, keyed by flag name
Actual map [ string ] * flag . Flag
// Formal holds the flags visited by FlagSet.VisitAll, keyed by flag name
Formal map [ string ] * flag . Flag
Effective map [ string ] * flag . Flag // All flags with non-empty value
Args [ ] string // arguments after flags
// OsArgs is a copy of os.Args
OsArgs [ ] string
}
// writeFlags exports the CLI flags to cli-flags.json in the cluster
// directory. The export separates formal flags (all defined flags), actual
// flags (those set on the command line) and effective flags (any flag whose
// value is a non-empty string).
func (c *OperatorDebugCommand) writeFlags(flags *flag.FlagSet) {
	export := flagExport{
		Name:      flags.Name(),
		Parsed:    flags.Parsed(),
		Formal:    make(map[string]*flag.Flag),
		Actual:    make(map[string]*flag.Flag),
		Effective: make(map[string]*flag.Flag),
		Args:      flags.Args(),
		OsArgs:    os.Args,
	}

	// Every defined flag is formal; those with a non-empty value are also
	// considered effective.
	flags.VisitAll(func(defined *flag.Flag) {
		export.Formal[defined.Name] = defined
		if defined.Value.String() != "" {
			export.Effective[defined.Name] = defined
		}
	})

	// Only flags that were set on the command line are visited here
	flags.Visit(func(set *flag.Flag) {
		export.Actual[set.Name] = set
	})

	c.writeJSON(clusterDir, "cli-flags.json", export, nil)
}
2020-06-25 16:51:23 +00:00
// writeManifest creates the index files
2020-08-11 19:39:44 +00:00
func ( c * OperatorDebugCommand ) writeManifest ( ) error {
2020-06-25 16:51:23 +00:00
// Write the JSON
path := filepath . Join ( c . collectDir , "index.json" )
jsonFh , err := os . Create ( path )
if err != nil {
return err
}
defer jsonFh . Close ( )
json . NewEncoder ( jsonFh ) . Encode ( c . manifest )
// Write the HTML
path = filepath . Join ( c . collectDir , "index.html" )
htmlFh , err := os . Create ( path )
if err != nil {
return err
}
defer htmlFh . Close ( )
head , _ := template . New ( "head" ) . Parse ( "<html><head><title>{{.}}</title></head>\n<body><h1>{{.}}</h1>\n<ul>" )
line , _ := template . New ( "line" ) . Parse ( "<li><a href=\"{{.}}\">{{.}}</a></li>\n" )
if err != nil {
return fmt . Errorf ( "%v" , err )
}
tail := "</ul></body></html>\n"
head . Execute ( htmlFh , c . timestamp )
for _ , f := range c . manifest {
line . Execute ( htmlFh , f )
}
htmlFh . WriteString ( tail )
return nil
}
// trap captures signals, and closes stopCh
2020-08-11 19:39:44 +00:00
func ( c * OperatorDebugCommand ) trap ( ) {
2020-06-25 16:51:23 +00:00
sigCh := make ( chan os . Signal , 1 )
signal . Notify ( sigCh ,
syscall . SIGHUP ,
syscall . SIGINT ,
syscall . SIGTERM ,
syscall . SIGQUIT )
go func ( ) {
<- sigCh
c . cancel ( )
} ( )
}
2022-01-18 02:35:51 +00:00
// verboseOut prints out on the UI, but only when verbose mode is enabled.
func (c *OperatorDebugCommand) verboseOut(out string) {
	if !c.verbose {
		return
	}
	c.Ui.Output(out)
}
// verboseOutf formats its arguments like fmt.Sprintf and passes the result to
// verboseOut.
func (c *OperatorDebugCommand) verboseOutf(format string, a ...interface{}) {
	message := fmt.Sprintf(format, a...)
	c.verboseOut(message)
}
2021-08-30 09:08:12 +00:00
// TarCZF like the tar command, recursively builds a gzip compressed tar
// archive from a directory. If not empty, all files in the bundle are prefixed
// with the target path.
//
// Only regular files are archived. Entry names are relative to src and always
// use forward slashes. Errors raised while finalizing the tar stream, the
// gzip stream, or the archive file are returned, so a truncated archive is
// never reported as a success.
func TarCZF(archive string, src, target string) error {
	// ensure the src actually exists before trying to tar it
	if _, err := os.Stat(src); err != nil {
		return fmt.Errorf("Unable to tar files - %v", err.Error())
	}

	// create the archive
	fh, err := os.Create(archive)
	if err != nil {
		return err
	}

	zz := gzip.NewWriter(fh)
	tw := tar.NewWriter(zz)

	// tar
	err = filepath.Walk(src, func(file string, fi os.FileInfo, err error) error {
		// return on any error
		if err != nil {
			return err
		}

		if !fi.Mode().IsRegular() {
			return nil
		}

		header, err := tar.FileInfoHeader(fi, fi.Name())
		if err != nil {
			return err
		}

		// Make the name relative to src. Only the leading src is removed, so
		// a path that happens to contain src again further down stays intact.
		rel, err := filepath.Rel(src, file)
		if err != nil {
			return err
		}
		if rel == "." {
			// src itself is a regular file; it contributes no relative part
			rel = ""
		}

		name := rel
		if target != "" {
			name = filepath.Join(target, rel)
		}
		name = strings.TrimPrefix(name, string(filepath.Separator))

		// tar entry names use forward slashes on every platform
		header.Name = filepath.ToSlash(name)

		if err := tw.WriteHeader(header); err != nil {
			return err
		}

		// copy the file contents; the deferred Close runs once per file, when
		// this callback returns, including when the copy fails
		f, err := os.Open(file)
		if err != nil {
			return err
		}
		defer f.Close()

		_, err = io.Copy(tw, f)
		return err
	})

	// Finalize inner to outer: tar footer, gzip trailer, then the file. The
	// first error encountered wins.
	for _, closer := range []io.Closer{tw, zz, fh} {
		if closeErr := closer.Close(); err == nil {
			err = closeErr
		}
	}

	return err
}
2021-10-12 20:58:41 +00:00
// filterServerMembers returns the names of the server members selected by
// serverIDs, a comma-separated list of name prefixes. The value "all" selects
// every member, and a "leader" entry is passed through verbatim. When region
// is non-empty, only members whose "region" tag matches it exactly are
// considered.
func filterServerMembers(serverMembers *api.ServerMembers, serverIDs string, region string) (membersFound []string, err error) {
	if serverMembers.Members == nil {
		return nil, fmt.Errorf("Failed to parse server members, members==nil")
	}

	prefixes := stringToSlice(serverIDs)

	// "leader" is a special case which Nomad handles in the API. When present
	// it goes straight into the result and is dropped from the prefixes so
	// the member loop does not treat it as a name prefix.
	if helper.SliceStringContains(prefixes, "leader") {
		membersFound = append(membersFound, "leader")
		helper.RemoveEqualFold(&prefixes, "leader")
	}

	for _, member := range serverMembers.Members {
		switch {
		case region != "" && member.Tags["region"] != region:
			// Region was requested and this member is in a different one
		case serverIDs == "all":
			membersFound = append(membersFound, member.Name)
		case helper.StringHasPrefixInSlice(member.Name, prefixes):
			membersFound = append(membersFound, member.Name)
		}
	}

	return membersFound, nil
}
// stringToSlice breaks a comma-separated string into its elements, trimming
// surrounding whitespace from each one and discarding elements that end up
// empty. The result is nil when no element remains.
func stringToSlice(input string) []string {
	var cleaned []string
	for _, field := range strings.Split(input, ",") {
		if trimmed := strings.TrimSpace(field); trimmed != "" {
			cleaned = append(cleaned, trimmed)
		}
	}
	return cleaned
}
2020-08-11 17:14:28 +00:00
2022-01-18 02:35:51 +00:00
// parseEventTopics converts a list of "topic" or "topic:filter" strings into
// a map from topic to its filters. Every entry that fails to parse is
// collected into the returned error and left out of the map, so the map only
// ever contains valid topics.
func parseEventTopics(topicList []string) (map[api.Topic][]string, error) {
	topics := make(map[api.Topic][]string)

	var mErrs *multierror.Error

	for _, topic := range topicList {
		k, v, err := parseTopic(topic)
		if err != nil {
			mErrs = multierror.Append(mErrs, err)
			// Skip the entry so no empty topic/filter pair is recorded
			continue
		}

		topics[api.Topic(k)] = append(topics[api.Topic(k)], v)
	}

	return topics, mErrs.ErrorOrNil()
}
// parseTopic splits a "topic" or "topic:filter" string into its topic and
// filter. A bare topic gets the wildcard filter "*". The returned topic is
// title-cased. Input with more than one ":" is rejected with an error that
// names the offending input.
func parseTopic(input string) (string, string, error) {
	var topic, filter string

	parts := strings.Split(input, ":")
	switch len(parts) {
	case 1:
		// infer wildcard if only given a topic
		topic = input
		filter = "*"
	case 2:
		topic = parts[0]
		filter = parts[1]
	default:
		// Report the raw input; topic has not been assigned on this path
		return "", "", fmt.Errorf("Invalid key value pair for topic: %s", input)
	}

	return strings.Title(topic), filter, nil
}
// allTopics returns a topic map with a single wildcard topic that carries a
// single wildcard filter.
func allTopics() map[api.Topic][]string {
	wildcard := make(map[api.Topic][]string, 1)
	wildcard[api.Topic("*")] = []string{"*"}
	return wildcard
}
// topicsFromString turns a comma separated list into a topic map. The special
// value "none" yields a nil map and "all" yields the wildcard map; anything
// else is split and parsed entry by entry.
func topicsFromString(topicList string) (map[api.Topic][]string, error) {
	switch topicList {
	case "none":
		return nil, nil
	case "all":
		return allTopics(), nil
	}

	parsed, err := parseEventTopics(stringToSlice(topicList))
	if err != nil {
		return nil, err
	}
	return parsed, nil
}
2020-08-11 17:14:28 +00:00
// external holds address configuration for Consul and Vault APIs
type external struct {
// tls is the TLS configuration handed to api.ConfigureTLS
tls * api . TLSConfig
// addrVal is the configured API address; see addr and setAddr
addrVal string
// auth is filled from the agent configuration's "Auth" entry
auth string
// ssl selects https over http when the address carries no protocol
ssl bool
// tokenVal is the API token; it takes precedence over tokenFile
tokenVal string
// tokenFile is a file that token() reads when tokenVal is empty
tokenFile string
}
func ( e * external ) addr ( defaultAddr string ) string {
if e . addrVal == "" {
return defaultAddr
}
2021-11-05 23:43:10 +00:00
// Return address as-is if it contains a protocol
if strings . Contains ( e . addrVal , "://" ) {
return e . addrVal
2020-08-11 17:14:28 +00:00
}
2021-11-05 23:43:10 +00:00
if e . ssl {
return "https://" + e . addrVal
2020-08-11 17:14:28 +00:00
}
2021-11-05 23:43:10 +00:00
return "http://" + e . addrVal
}
func ( e * external ) setAddr ( addr string ) {
// Handle no protocol scenario first
if ! strings . Contains ( addr , "://" ) {
e . addrVal = "http://" + addr
if e . ssl {
e . addrVal = "https://" + addr
}
return
2020-08-11 17:14:28 +00:00
}
2021-11-05 23:43:10 +00:00
// Set SSL boolean based on protocol
e . ssl = false
if strings . Contains ( addr , "https" ) {
e . ssl = true
}
e . addrVal = addr
2020-08-11 17:14:28 +00:00
}
// token returns the API token: the explicit token value when set, otherwise
// the whitespace-trimmed contents of the token file. It returns an empty
// string when neither is configured or the file cannot be read.
func (e *external) token() string {
	if e.tokenVal != "" {
		return e.tokenVal
	}

	if e.tokenFile == "" {
		return ""
	}

	contents, err := ioutil.ReadFile(e.tokenFile)
	if err != nil {
		return ""
	}
	return strings.TrimSpace(string(contents))
}
2021-11-05 23:43:10 +00:00
// getConsulAddrFromSelf copies the Consul settings (EnableSSL, Addr, Auth,
// Token) from the agent's self configuration into c.consul and returns the
// resulting Consul address. It returns an empty string when self is nil or
// carries no usable "Consul" section. Entries that are missing or have an
// unexpected type are tolerated instead of causing a panic.
func (c *OperatorDebugCommand) getConsulAddrFromSelf(self *api.AgentSelf) string {
	if self == nil {
		return ""
	}

	m, ok := self.Config["Consul"].(map[string]interface{})
	if !ok {
		return ""
	}

	// ssl must be set before the address: setAddr uses it to choose the
	// protocol for addresses that do not carry one.
	c.consul.ssl, _ = m["EnableSSL"].(bool)

	if addr, ok := m["Addr"].(string); ok {
		c.consul.setAddr(addr)
	}

	c.consul.auth, _ = m["Auth"].(string)
	c.consul.tokenVal, _ = m["Token"].(string)

	return c.consul.addr("")
}
// getVaultAddrFromSelf copies the Vault settings (EnableSSL, Addr, Auth,
// Token) from the agent's self configuration into c.vault and returns the
// resulting Vault address. It returns an empty string when self is nil or
// carries no usable "Vault" section. Entries that are missing or have an
// unexpected type are tolerated instead of causing a panic.
func (c *OperatorDebugCommand) getVaultAddrFromSelf(self *api.AgentSelf) string {
	if self == nil {
		return ""
	}

	m, ok := self.Config["Vault"].(map[string]interface{})
	if !ok {
		return ""
	}

	// ssl must be set before the address: setAddr uses it to choose the
	// protocol for addresses that do not carry one.
	c.vault.ssl, _ = m["EnableSSL"].(bool)

	if addr, ok := m["Addr"].(string); ok {
		c.vault.setAddr(addr)
	}

	c.vault.auth, _ = m["Auth"].(string)
	c.vault.tokenVal, _ = m["Token"].(string)

	return c.vault.addr("")
}
2020-08-11 17:14:28 +00:00
// defaultHttpClient returns a cleanhttp default client whose transport uses a
// 10 second TLS handshake timeout and requires TLS 1.2 or newer.
func defaultHttpClient() *http.Client {
	client := cleanhttp.DefaultClient()

	tr := client.Transport.(*http.Transport)
	tr.TLSClientConfig = &tls.Config{MinVersion: tls.VersionTLS12}
	tr.TLSHandshakeTimeout = 10 * time.Second

	return client
}
2022-01-17 16:15:17 +00:00
// isRedirectError returns true if an error is a redirect error.
// Detection is a substring match on the error text: a nil error is never a
// redirect error, and any other error counts as one when its message contains
// the "invalid character '<'" decode failure below.
// NOTE(review): presumably that message appears when an HTML page is returned
// where JSON was expected — confirm against the API client.
func isRedirectError ( err error ) bool {
if err == nil {
return false
}
const redirectErr string = ` invalid character '<' looking for beginning of value `
return strings . Contains ( err . Error ( ) , redirectErr )
}