package command

import (
	"archive/tar"
	"compress/gzip"
	"context"
	"crypto/tls"
	"encoding/json"
	"fmt"
	"html/template"
	"io"
	"io/ioutil"
	"net/http"
	"os"
	"os/signal"
	"path/filepath"
	"strconv"
	"strings"
	"syscall"
	"time"

	"github.com/hashicorp/go-cleanhttp"
	"github.com/hashicorp/nomad/api"
	"github.com/hashicorp/nomad/helper"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/posener/complete"
)
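
// OperatorDebugCommand is the command implementation for "nomad operator debug".
// It gathers cluster configuration and state, Consul and Vault status, agent
// logs, and pprof profiles into a timestamped debug bundle.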
type OperatorDebugCommand struct {
	Meta

	timestamp     string
	collectDir    string
	duration      time.Duration
	interval      time.Duration
	pprofDuration time.Duration
	logLevel      string
	stale         bool
	maxNodes      int
	nodeClass     string
	nodeIDs       []string
	serverIDs     []string
	consul        *external
	vault         *external
	manifest      []string
	ctx           context.Context
	cancel        context.CancelFunc
}

const (
	userAgent = "nomad operator debug"
)

func (c *OperatorDebugCommand) Help() string {
	helpText := `
Usage: nomad operator debug [options]

  Build an archive containing Nomad cluster configuration and state, and Consul
  and Vault status. Include logs and pprof profiles for selected servers and
  client nodes.

  If ACLs are enabled, this command will require a token with the 'node:read'
  capability to run. In order to collect information, the token will also
  require the 'agent:read' and 'operator:read' capabilities, as well as the
  'list-jobs' capability for all namespaces. To collect pprof profiles the
  token will also require 'agent:write', or enable_debug configuration set to
  true.

General Options:

  ` + generalOptionsUsage(usageOptsDefault|usageOptsNoNamespace) + `

Consul Options:

  -consul-http-addr=<addr>
    The address and port of the Consul HTTP agent. Overrides the
    CONSUL_HTTP_ADDR environment variable.

  -consul-token=<token>
    Token used to query Consul. Overrides the CONSUL_HTTP_TOKEN environment
    variable and the Consul token file.

  -consul-token-file=<path>
    Path to the Consul token file. Overrides the CONSUL_HTTP_TOKEN_FILE
    environment variable.

  -consul-client-cert=<path>
    Path to the Consul client cert file. Overrides the CONSUL_CLIENT_CERT
    environment variable.

  -consul-client-key=<path>
    Path to the Consul client key file. Overrides the CONSUL_CLIENT_KEY
    environment variable.

  -consul-ca-cert=<path>
    Path to a CA file to use with Consul. Overrides the CONSUL_CACERT
    environment variable and the Consul CA path.

  -consul-ca-path=<path>
    Path to a directory of PEM encoded CA cert files to verify the Consul
    certificate. Overrides the CONSUL_CAPATH environment variable.

Vault Options:

  -vault-address=<addr>
    The address and port of the Vault HTTP agent. Overrides the VAULT_ADDR
    environment variable.

  -vault-token=<token>
    Token used to query Vault. Overrides the VAULT_TOKEN environment
    variable.

  -vault-client-cert=<path>
    Path to the Vault client cert file. Overrides the VAULT_CLIENT_CERT
    environment variable.

  -vault-client-key=<path>
    Path to the Vault client key file. Overrides the VAULT_CLIENT_KEY
    environment variable.

  -vault-ca-cert=<path>
    Path to a CA file to use with Vault. Overrides the VAULT_CACERT
    environment variable and the Vault CA path.

  -vault-ca-path=<path>
    Path to a directory of PEM encoded CA cert files to verify the Vault
    certificate. Overrides the VAULT_CAPATH environment variable.

Debug Options:

  -duration=<duration>
    The duration of the log monitor command. Defaults to 2m.

  -interval=<interval>
    The interval between snapshots of the Nomad state. Set interval equal to
    duration to capture a single snapshot. Defaults to 30s.

  -log-level=<level>
    The log level to monitor. Defaults to DEBUG.

  -max-nodes=<count>
    Cap the maximum number of client nodes included in the capture. Defaults
    to 10, set to 0 for unlimited.

  -node-id=<node>,<node>
    Comma separated list of Nomad client node ids to monitor for logs, API
    outputs, and pprof profiles. Accepts id prefixes, and "all" to select all
    nodes (up to count = max-nodes). Defaults to "all".

  -node-class=<node-class>
    Filter client nodes based on node class.

  -pprof-duration=<duration>
    Duration for pprof collection. Defaults to 1s.

  -server-id=<server>,<server>
    Comma separated list of Nomad server names to monitor for logs, API
    outputs, and pprof profiles. Accepts server names, "leader", or "all".
    Defaults to "all".

  -stale=<true|false>
    If "false", the default, get membership data from the cluster leader. If
    the cluster is in an outage unable to establish leadership, it may be
    necessary to get the configuration from a non-leader server.

  -output=<path>
    Path to the parent directory of the output directory. If not specified, an
    archive is built in the current directory.
`
	return strings.TrimSpace(helpText)
}

func (c *OperatorDebugCommand) Synopsis() string {
	return "Build a debug archive"
}

func (c *OperatorDebugCommand) AutocompleteFlags() complete.Flags {
	return mergeAutocompleteFlags(c.Meta.AutocompleteFlags(FlagSetClient),
		complete.Flags{
			"-duration":       complete.PredictAnything,
			"-interval":       complete.PredictAnything,
			"-log-level":      complete.PredictAnything,
			"-max-nodes":      complete.PredictAnything,
			"-node-class":     complete.PredictAnything,
			"-node-id":        complete.PredictAnything,
			"-server-id":      complete.PredictAnything,
			"-output":         complete.PredictAnything,
			"-pprof-duration": complete.PredictAnything,
			"-consul-token":   complete.PredictAnything,
			"-vault-token":    complete.PredictAnything,
		})
}

func (c *OperatorDebugCommand) AutocompleteArgs() complete.Predictor {
	return complete.PredictNothing
}

func (c *OperatorDebugCommand) Name() string { return "debug" }
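
// Run parses the debug flags, resolves the target client nodes and servers,
// captures cluster state and agent profiles, and writes the debug bundle.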
func ( c * OperatorDebugCommand ) Run ( args [ ] string ) int {
2020-06-25 16:51:23 +00:00
flags := c . Meta . FlagSet ( c . Name ( ) , FlagSetClient )
flags . Usage = func ( ) { c . Ui . Output ( c . Help ( ) ) }
2020-12-01 17:36:05 +00:00
var duration , interval , output , pprofDuration string
2020-06-25 16:51:23 +00:00
var nodeIDs , serverIDs string
flags . StringVar ( & duration , "duration" , "2m" , "" )
2021-03-09 13:31:38 +00:00
flags . StringVar ( & interval , "interval" , "30s" , "" )
2020-06-25 16:51:23 +00:00
flags . StringVar ( & c . logLevel , "log-level" , "DEBUG" , "" )
2020-11-12 16:25:28 +00:00
flags . IntVar ( & c . maxNodes , "max-nodes" , 10 , "" )
flags . StringVar ( & c . nodeClass , "node-class" , "" , "" )
2020-06-25 16:51:23 +00:00
flags . StringVar ( & nodeIDs , "node-id" , "" , "" )
2021-03-09 13:31:38 +00:00
flags . StringVar ( & serverIDs , "server-id" , "all" , "" )
2020-08-11 17:14:28 +00:00
flags . BoolVar ( & c . stale , "stale" , false , "" )
2020-06-25 16:51:23 +00:00
flags . StringVar ( & output , "output" , "" , "" )
2020-12-01 17:36:05 +00:00
flags . StringVar ( & pprofDuration , "pprof-duration" , "1s" , "" )
2020-08-11 17:14:28 +00:00
c . consul = & external { tls : & api . TLSConfig { } }
flags . StringVar ( & c . consul . addrVal , "consul-http-addr" , os . Getenv ( "CONSUL_HTTP_ADDR" ) , "" )
ssl := os . Getenv ( "CONSUL_HTTP_SSL" )
c . consul . ssl , _ = strconv . ParseBool ( ssl )
flags . StringVar ( & c . consul . auth , "consul-auth" , os . Getenv ( "CONSUL_HTTP_AUTH" ) , "" )
flags . StringVar ( & c . consul . tokenVal , "consul-token" , os . Getenv ( "CONSUL_HTTP_TOKEN" ) , "" )
flags . StringVar ( & c . consul . tokenFile , "consul-token-file" , os . Getenv ( "CONSUL_HTTP_TOKEN_FILE" ) , "" )
flags . StringVar ( & c . consul . tls . ClientCert , "consul-client-cert" , os . Getenv ( "CONSUL_CLIENT_CERT" ) , "" )
flags . StringVar ( & c . consul . tls . ClientKey , "consul-client-key" , os . Getenv ( "CONSUL_CLIENT_KEY" ) , "" )
flags . StringVar ( & c . consul . tls . CACert , "consul-ca-cert" , os . Getenv ( "CONSUL_CACERT" ) , "" )
flags . StringVar ( & c . consul . tls . CAPath , "consul-ca-path" , os . Getenv ( "CONSUL_CAPATH" ) , "" )
c . vault = & external { tls : & api . TLSConfig { } }
flags . StringVar ( & c . vault . addrVal , "vault-address" , os . Getenv ( "VAULT_ADDR" ) , "" )
flags . StringVar ( & c . vault . tokenVal , "vault-token" , os . Getenv ( "VAULT_TOKEN" ) , "" )
flags . StringVar ( & c . vault . tls . CACert , "vault-ca-cert" , os . Getenv ( "VAULT_CACERT" ) , "" )
flags . StringVar ( & c . vault . tls . CAPath , "vault-ca-path" , os . Getenv ( "VAULT_CAPATH" ) , "" )
flags . StringVar ( & c . vault . tls . ClientCert , "vault-client-cert" , os . Getenv ( "VAULT_CLIENT_CERT" ) , "" )
flags . StringVar ( & c . vault . tls . ClientKey , "vault-client-key" , os . Getenv ( "VAULT_CLIENT_KEY" ) , "" )
2020-06-25 16:51:23 +00:00
if err := flags . Parse ( args ) ; err != nil {
2020-10-06 02:30:01 +00:00
c . Ui . Error ( fmt . Sprintf ( "Error parsing arguments: %q" , err ) )
2020-06-25 16:51:23 +00:00
return 1
}

	// Parse the capture duration
	d, err := time.ParseDuration(duration)
	if err != nil {
		c.Ui.Error(fmt.Sprintf("Error parsing duration: %s: %s", duration, err.Error()))
		return 1
	}
	c.duration = d

	// Parse the capture interval
	i, err := time.ParseDuration(interval)
	if err != nil {
		c.Ui.Error(fmt.Sprintf("Error parsing interval: %s: %s", interval, err.Error()))
		return 1
	}
	c.interval = i

	// Validate interval
	if i.Seconds() > d.Seconds() {
		c.Ui.Error(fmt.Sprintf("Error parsing interval: %s is greater than duration %s", interval, duration))
		return 1
	}

	// Parse the pprof capture duration
	pd, err := time.ParseDuration(pprofDuration)
	if err != nil {
		c.Ui.Error(fmt.Sprintf("Error parsing pprof duration: %s: %s", pprofDuration, err.Error()))
		return 1
	}
	c.pprofDuration = pd

	// Verify there are no extra arguments
	args = flags.Args()
	if l := len(args); l != 0 {
		c.Ui.Error("This command takes no arguments")
		c.Ui.Error(commandErrorText(c))
		return 1
	}

	// Initialize capture variables and structs
	c.manifest = make([]string, 0)
	ctx, cancel := context.WithCancel(context.Background())
	c.ctx = ctx
	c.cancel = cancel
	c.trap()

	// Generate timestamped file name
	format := "2006-01-02-150405Z"
	c.timestamp = time.Now().UTC().Format(format)
	stamped := "nomad-debug-" + c.timestamp

	// Create the output directory
	var tmp string
	if output != "" {
		// User specified output directory
		tmp = filepath.Join(output, stamped)
		_, err := os.Stat(tmp)
		if !os.IsNotExist(err) {
			c.Ui.Error("Output directory already exists")
			return 2
		}
	} else {
		// Generate temp directory
		tmp, err = ioutil.TempDir(os.TempDir(), stamped)
		if err != nil {
			c.Ui.Error(fmt.Sprintf("Error creating tmp directory: %s", err.Error()))
			return 2
		}
		defer os.RemoveAll(tmp)
	}

	c.collectDir = tmp

	// Create an instance of the API client
	client, err := c.Meta.Client()
	if err != nil {
		c.Ui.Error(fmt.Sprintf("Error initializing client: %s", err.Error()))
		return 1
	}

	// Search all nodes if a node class is specified without a list of node id prefixes
	if c.nodeClass != "" && nodeIDs == "" {
		nodeIDs = "all"
	}

	// Resolve client node id prefixes
	nodesFound := 0
	nodeLookupFailCount := 0
	nodeCaptureCount := 0

	for _, id := range argNodes(nodeIDs) {
		if id == "all" {
			// Capture from all nodes using empty prefix filter
			id = ""
		} else {
			// Capture from nodes starting with prefix id
			id = sanitizeUUIDPrefix(id)
		}

		nodes, _, err := client.Nodes().PrefixList(id)
		if err != nil {
			c.Ui.Error(fmt.Sprintf("Error querying node info: %s", err))
			return 1
		}

		// Increment fail count if no nodes are found
		if len(nodes) == 0 {
			c.Ui.Error(fmt.Sprintf("No node(s) with prefix %q found", id))
			nodeLookupFailCount++
			continue
		}

		nodesFound += len(nodes)

		// Apply constraints to nodes found
		for _, n := range nodes {
			// Ignore nodes that do not match specified class
			if c.nodeClass != "" && n.NodeClass != c.nodeClass {
				continue
			}

			// Add node to capture list
			c.nodeIDs = append(c.nodeIDs, n.ID)
			nodeCaptureCount++

			// Stop looping when we reach the max
			if c.maxNodes != 0 && nodeCaptureCount >= c.maxNodes {
				break
			}
		}
	}

	// Return error if nodes were specified but none were found
	if len(nodeIDs) > 0 && nodeCaptureCount == 0 {
		c.Ui.Error(fmt.Sprintf("Failed to retrieve clients, 0 nodes found in list: %s", nodeIDs))
		return 1
	}

	// Resolve servers
	members, err := client.Agent().Members()
	if err != nil {
		c.Ui.Error(fmt.Sprintf("Failed to retrieve server list; err: %v", err))
		return 1
	}
	c.writeJSON("version", "members.json", members, err)

	// We always write the error to the file, but don't range if no members found
	if serverIDs == "all" && members != nil {
		// Special case to capture from all servers
		for _, member := range members.Members {
			c.serverIDs = append(c.serverIDs, member.Name)
		}
	} else {
		c.serverIDs = append(c.serverIDs, argNodes(serverIDs)...)
	}

	serversFound := 0
	serverCaptureCount := 0

	if members != nil {
		serversFound = len(members.Members)
	}
	if c.serverIDs != nil {
		serverCaptureCount = len(c.serverIDs)
	}

	// Return error if servers were specified but not found
	if len(serverIDs) > 0 && serverCaptureCount == 0 {
		c.Ui.Error(fmt.Sprintf("Failed to retrieve servers, 0 members found in list: %s", serverIDs))
		return 1
	}

	// Display general info about the capture
	c.Ui.Output("Starting debugger...")
	c.Ui.Output("")
	c.Ui.Output(fmt.Sprintf(" Servers: (%d/%d) %v", serverCaptureCount, serversFound, c.serverIDs))
	c.Ui.Output(fmt.Sprintf(" Clients: (%d/%d) %v", nodeCaptureCount, nodesFound, c.nodeIDs))
	if nodeCaptureCount > 0 && nodeCaptureCount == c.maxNodes {
		c.Ui.Output(fmt.Sprintf(" Max node count reached (%d)", c.maxNodes))
	}
	if nodeLookupFailCount > 0 {
		c.Ui.Output(fmt.Sprintf("Client fail count: %v", nodeLookupFailCount))
	}
	if c.nodeClass != "" {
		c.Ui.Output(fmt.Sprintf(" Node Class: %s", c.nodeClass))
	}
	c.Ui.Output(fmt.Sprintf(" Interval: %s", interval))
	c.Ui.Output(fmt.Sprintf(" Duration: %s", duration))
	if c.pprofDuration.Seconds() != 1 {
		c.Ui.Output(fmt.Sprintf(" pprof Duration: %s", c.pprofDuration))
	}
	c.Ui.Output("")
	c.Ui.Output("Capturing cluster data...")

	// Start collecting data
	err = c.collect(client)
	if err != nil {
		c.Ui.Error(fmt.Sprintf("Error collecting data: %s", err.Error()))
		return 2
	}

	// Write index json/html manifest files
	c.writeManifest()

	// Exit before archive if output directory was specified
	if output != "" {
		c.Ui.Output(fmt.Sprintf("Created debug directory: %s", c.collectDir))
		return 0
	}

	// Create archive tarball
	archiveFile := stamped + ".tar.gz"
	err = TarCZF(archiveFile, tmp, stamped)
	if err != nil {
		c.Ui.Error(fmt.Sprintf("Error creating archive: %s", err.Error()))
		return 2
	}

	// Final output with name of tarball
	c.Ui.Output(fmt.Sprintf("Created debug archive: %s", archiveFile))
	return 0
}

// collect collects data from our endpoints and writes the archive bundle
func (c *OperatorDebugCommand) collect(client *api.Client) error {
	// Version contains cluster meta information
	dir := "version"

	self, err := client.Agent().Self()
	c.writeJSON(dir, "agent-self.json", self, err)

	// Fetch data directly from consul and vault. Ignore errors
	var consul, vault string

	if self != nil {
		r, ok := self.Config["Consul"]
		if ok {
			m, ok := r.(map[string]interface{})
			if ok {
				raw := m["Addr"]
				consul, _ = raw.(string)
				raw = m["EnableSSL"]
				ssl, _ := raw.(bool)
				if ssl {
					consul = "https://" + consul
				} else {
					consul = "http://" + consul
				}
			}
		}

		r, ok = self.Config["Vault"]
		if ok {
			m, ok := r.(map[string]interface{})
			if ok {
				raw := m["Addr"]
				vault, _ = raw.(string)
			}
		}
	}

	c.collectConsul(dir, consul)
	c.collectVault(dir, vault)
	c.collectAgentHosts(client)
	c.collectPprofs(client)

	c.startMonitors(client)
	c.collectPeriodic(client)

	return nil
}

// path returns platform specific paths in the tmp root directory
func (c *OperatorDebugCommand) path(paths ...string) string {
	ps := []string{c.collectDir}
	ps = append(ps, paths...)
	return filepath.Join(ps...)
}

// mkdir creates directories in the tmp root directory
func (c *OperatorDebugCommand) mkdir(paths ...string) error {
	joinedPath := c.path(paths...)

	// Ensure path doesn't escape the sandbox of the capture directory
	escapes := helper.PathEscapesSandbox(c.collectDir, joinedPath)
	if escapes {
		return fmt.Errorf("file path escapes capture directory")
	}

	return os.MkdirAll(joinedPath, 0755)
}

// startMonitors starts a monitor goroutine for each selected client node and server
func (c *OperatorDebugCommand) startMonitors(client *api.Client) {
	for _, id := range c.nodeIDs {
		go c.startMonitor("client", "node_id", id, client)
	}

	for _, id := range c.serverIDs {
		go c.startMonitor("server", "server_id", id, client)
	}
}

// startMonitor starts one monitor api request, writing to a file. It blocks and should be
// called in a go routine. Errors are ignored, we want to build the archive even if a node
// is unavailable
func (c *OperatorDebugCommand) startMonitor(path, idKey, nodeID string, client *api.Client) {
	c.mkdir(path, nodeID)
	fh, err := os.Create(c.path(path, nodeID, "monitor.log"))
	if err != nil {
		return
	}
	defer fh.Close()

	qo := api.QueryOptions{
		Params: map[string]string{
			idKey:       nodeID,
			"log_level": c.logLevel,
		},
	}

	outCh, errCh := client.Agent().Monitor(c.ctx.Done(), &qo)
	for {
		select {
		case out := <-outCh:
			if out == nil {
				continue
			}
			fh.Write(out.Data)

		case err := <-errCh:
			fh.WriteString(fmt.Sprintf("monitor: %s\n", err.Error()))
			return

		case <-c.ctx.Done():
			return
		}
	}
}

// collectAgentHosts calls collectAgentHost for each selected node
func (c *OperatorDebugCommand) collectAgentHosts(client *api.Client) {
	for _, n := range c.nodeIDs {
		c.collectAgentHost("client", n, client)
	}

	for _, n := range c.serverIDs {
		c.collectAgentHost("server", n, client)
	}
}

// collectAgentHost gets the agent host data
func (c *OperatorDebugCommand) collectAgentHost(path, id string, client *api.Client) {
	var host *api.HostDataResponse
	var err error
	if path == "server" {
		host, err = client.Agent().Host(id, "", nil)
	} else {
		host, err = client.Agent().Host("", id, nil)
	}

	if err != nil {
		c.Ui.Error(fmt.Sprintf("%s/%s: Failed to retrieve agent host data, err: %v", path, id, err))

		if strings.Contains(err.Error(), structs.ErrPermissionDenied.Error()) {
			// Drop a hint to help the operator resolve the error
			c.Ui.Warn("Agent host retrieval requires agent:read ACL or enable_debug=true. See https://www.nomadproject.io/api-docs/agent#host for more information.")
		}
		return // exit on any error
	}

	path = filepath.Join(path, id)
	c.writeJSON(path, "agent-host.json", host, err)
}

// collectPprofs captures the /agent/pprof for each listed node
func (c *OperatorDebugCommand) collectPprofs(client *api.Client) {
	for _, n := range c.nodeIDs {
		c.collectPprof("client", n, client)
	}

	for _, n := range c.serverIDs {
		c.collectPprof("server", n, client)
	}
}

// collectPprof captures pprof data for the node
func (c *OperatorDebugCommand) collectPprof(path, id string, client *api.Client) {
	pprofDurationSeconds := int(c.pprofDuration.Seconds())
	opts := api.PprofOptions{Seconds: pprofDurationSeconds}
	if path == "server" {
		opts.ServerID = id
	} else {
		opts.NodeID = id
	}

	path = filepath.Join(path, id)

	bs, err := client.Agent().CPUProfile(opts, nil)
	if err != nil {
		c.Ui.Error(fmt.Sprintf("%s: Failed to retrieve pprof profile.prof, err: %v", path, err))
		if structs.IsErrPermissionDenied(err) {
			// All profiles require the same permissions, so we only need to see
			// one permission failure before we bail.
			// But let's first drop a hint to help the operator resolve the error
			c.Ui.Warn("Pprof retrieval requires agent:write ACL or enable_debug=true. See https://www.nomadproject.io/api-docs/agent#agent-runtime-profiles for more information.")
			return // only exit on 403
		}
	} else {
		err := c.writeBytes(path, "profile.prof", bs)
		if err != nil {
			c.Ui.Error(err.Error())
		}
	}

	bs, err = client.Agent().Trace(opts, nil)
	if err != nil {
		c.Ui.Error(fmt.Sprintf("%s: Failed to retrieve pprof trace.prof, err: %v", path, err))
	} else {
		err := c.writeBytes(path, "trace.prof", bs)
		if err != nil {
			c.Ui.Error(err.Error())
		}
	}

	bs, err = client.Agent().Lookup("goroutine", opts, nil)
	if err != nil {
		c.Ui.Error(fmt.Sprintf("%s: Failed to retrieve pprof goroutine.prof, err: %v", path, err))
	} else {
		err := c.writeBytes(path, "goroutine.prof", bs)
		if err != nil {
			c.Ui.Error(err.Error())
		}
	}

	// Gather goroutine text output - debug type 1
	// debug type 1 writes the legacy text format for human readable output
	opts.Debug = 1
	bs, err = client.Agent().Lookup("goroutine", opts, nil)
	if err != nil {
		c.Ui.Error(fmt.Sprintf("%s: Failed to retrieve pprof goroutine-debug1.txt, err: %v", path, err))
	} else {
		err := c.writeBytes(path, "goroutine-debug1.txt", bs)
		if err != nil {
			c.Ui.Error(err.Error())
		}
	}

	// Gather goroutine text output - debug type 2
	// When printing the "goroutine" profile, debug=2 means to print the goroutine
	// stacks in the same form that a Go program uses when dying due to an unrecovered panic.
	opts.Debug = 2
	bs, err = client.Agent().Lookup("goroutine", opts, nil)
	if err != nil {
		c.Ui.Error(fmt.Sprintf("%s: Failed to retrieve pprof goroutine-debug2.txt, err: %v", path, err))
	} else {
		err := c.writeBytes(path, "goroutine-debug2.txt", bs)
		if err != nil {
			c.Ui.Error(err.Error())
		}
	}
}

// collectPeriodic runs for duration, capturing the cluster state every interval. It flushes and stops
// the monitor requests
func (c *OperatorDebugCommand) collectPeriodic(client *api.Client) {
	duration := time.After(c.duration)
	// Set interval to 0 so that we immediately execute, wait the interval next time
	interval := time.After(0 * time.Second)
	var intervalCount int
	var name, dir string

	for {
		select {
		case <-duration:
			c.cancel()
			return

		case <-interval:
			name = fmt.Sprintf("%04d", intervalCount)
			dir = filepath.Join("nomad", name)
			c.Ui.Output(fmt.Sprintf(" Capture interval %s", name))
			c.collectNomad(dir, client)
			c.collectOperator(dir, client)
			interval = time.After(c.interval)
			intervalCount++

		case <-c.ctx.Done():
			return
		}
	}
}

// collectOperator captures some cluster meta information
func (c *OperatorDebugCommand) collectOperator(dir string, client *api.Client) {
	rc, err := client.Operator().RaftGetConfiguration(nil)
	c.writeJSON(dir, "operator-raft.json", rc, err)

	sc, _, err := client.Operator().SchedulerGetConfiguration(nil)
	c.writeJSON(dir, "operator-scheduler.json", sc, err)

	ah, _, err := client.Operator().AutopilotServerHealth(nil)
	c.writeJSON(dir, "operator-autopilot-health.json", ah, err)

	lic, _, err := client.Operator().LicenseGet(nil)
	c.writeJSON(dir, "license.json", lic, err)
}

// collectNomad captures the nomad cluster state
func (c *OperatorDebugCommand) collectNomad(dir string, client *api.Client) error {
	var qo *api.QueryOptions

	js, _, err := client.Jobs().List(qo)
	c.writeJSON(dir, "jobs.json", js, err)

	ds, _, err := client.Deployments().List(qo)
	c.writeJSON(dir, "deployments.json", ds, err)

	es, _, err := client.Evaluations().List(qo)
	c.writeJSON(dir, "evaluations.json", es, err)

	as, _, err := client.Allocations().List(qo)
	c.writeJSON(dir, "allocations.json", as, err)

	ns, _, err := client.Nodes().List(qo)
	c.writeJSON(dir, "nodes.json", ns, err)

	// CSI Plugins - /v1/plugins?type=csi
	ps, _, err := client.CSIPlugins().List(qo)
	c.writeJSON(dir, "plugins.json", ps, err)

	// CSI Plugin details - /v1/plugin/csi/:plugin_id
	for _, p := range ps {
		csiPlugin, _, err := client.CSIPlugins().Info(p.ID, qo)
		csiPluginFileName := fmt.Sprintf("csi-plugin-id-%s.json", p.ID)
		c.writeJSON(dir, csiPluginFileName, csiPlugin, err)
	}

	// CSI Volumes - /v1/volumes?type=csi
	csiVolumes, _, err := client.CSIVolumes().List(qo)
	c.writeJSON(dir, "csi-volumes.json", csiVolumes, err)

	// CSI Volume details - /v1/volumes/csi/:volume-id
	for _, v := range csiVolumes {
		csiVolume, _, err := client.CSIVolumes().Info(v.ID, qo)
		csiFileName := fmt.Sprintf("csi-volume-id-%s.json", v.ID)
		c.writeJSON(dir, csiFileName, csiVolume, err)
	}

	metrics, _, err := client.Operator().MetricsSummary(qo)
	c.writeJSON(dir, "metrics.json", metrics, err)

	return nil
}

// collectConsul calls the Consul API directly to collect data
func (c *OperatorDebugCommand) collectConsul(dir, consul string) error {
	addr := c.consul.addr(consul)
	if addr == "" {
		return nil
	}

	client := defaultHttpClient()
	api.ConfigureTLS(client, c.consul.tls)

	req, _ := http.NewRequest("GET", addr+"/v1/agent/self", nil)
	req.Header.Add("X-Consul-Token", c.consul.token())
	req.Header.Add("User-Agent", userAgent)
	resp, err := client.Do(req)
	c.writeBody(dir, "consul-agent-self.json", resp, err)

	req, _ = http.NewRequest("GET", addr+"/v1/agent/members", nil)
	req.Header.Add("X-Consul-Token", c.consul.token())
	req.Header.Add("User-Agent", userAgent)
	resp, err = client.Do(req)
	c.writeBody(dir, "consul-agent-members.json", resp, err)

	return nil
}

// collectVault calls the Vault API directly to collect data
func (c *OperatorDebugCommand) collectVault(dir, vault string) error {
	addr := c.vault.addr(vault)
	if addr == "" {
		return nil
	}

	client := defaultHttpClient()
	api.ConfigureTLS(client, c.vault.tls)

	req, _ := http.NewRequest("GET", addr+"/v1/sys/health", nil)
	req.Header.Add("X-Vault-Token", c.vault.token())
	req.Header.Add("User-Agent", userAgent)
	resp, err := client.Do(req)
	c.writeBody(dir, "vault-sys-health.json", resp, err)

	return nil
}

// writeBytes writes a file to the archive, recording it in the manifest
func (c *OperatorDebugCommand) writeBytes(dir, file string, data []byte) error {
	// Replace invalid characters in filename
	filename := helper.CleanFilename(file, "_")

	relativePath := filepath.Join(dir, filename)
	c.manifest = append(c.manifest, relativePath)
	dirPath := filepath.Join(c.collectDir, dir)
	filePath := filepath.Join(dirPath, filename)

	// Ensure parent directories exist
	err := os.MkdirAll(dirPath, os.ModePerm)
	if err != nil {
		return fmt.Errorf("failed to create parent directories of \"%s\": %w", dirPath, err)
	}

	// Ensure filename doesn't escape the sandbox of the capture directory
	escapes := helper.PathEscapesSandbox(c.collectDir, filePath)
	if escapes {
		return fmt.Errorf("file path \"%s\" escapes capture directory \"%s\"", filePath, c.collectDir)
	}

	// Create the file
	fh, err := os.Create(filePath)
	if err != nil {
		return fmt.Errorf("failed to create file \"%s\", err: %w", filePath, err)
	}
	defer fh.Close()

	_, err = fh.Write(data)
	if err != nil {
		return fmt.Errorf("failed to write data to file \"%s\", err: %w", filePath, err)
	}
	return nil
}

// writeJSON writes JSON responses from the Nomad API calls to the archive
func (c *OperatorDebugCommand) writeJSON(dir, file string, data interface{}, err error) error {
	if err != nil {
		return c.writeError(dir, file, err)
	}
	bytes, err := json.Marshal(data)
	if err != nil {
		return c.writeError(dir, file, err)
	}
	err = c.writeBytes(dir, file, bytes)
	if err != nil {
		c.Ui.Error(err.Error())
	}
	return nil
}

// writeError writes a JSON error object to capture errors in the debug bundle without
// reporting
func (c *OperatorDebugCommand) writeError(dir, file string, err error) error {
	bytes, err := json.Marshal(errorWrapper{Error: err.Error()})
	if err != nil {
		return err
	}
	return c.writeBytes(dir, file, bytes)
}

type errorWrapper struct {
	Error string
}

// writeBody is a helper that writes the body of an http.Response to the archive
func (c *OperatorDebugCommand) writeBody(dir, file string, resp *http.Response, err error) {
	if err != nil {
		c.writeError(dir, file, err)
		return
	}

	if resp.ContentLength == 0 {
		return
	}

	defer resp.Body.Close()

	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		c.writeError(dir, file, err)
		return
	}

	if err := c.writeBytes(dir, file, body); err != nil {
		c.Ui.Error(err.Error())
	}
}

// writeManifest creates the index files
func (c *OperatorDebugCommand) writeManifest() error {
	// Write the JSON
	path := filepath.Join(c.collectDir, "index.json")
	jsonFh, err := os.Create(path)
	if err != nil {
		return err
	}
	defer jsonFh.Close()

	json.NewEncoder(jsonFh).Encode(c.manifest)

	// Write the HTML
	path = filepath.Join(c.collectDir, "index.html")
	htmlFh, err := os.Create(path)
	if err != nil {
		return err
	}
	defer htmlFh.Close()

	head, _ := template.New("head").Parse("<html><head><title>{{.}}</title></head>\n<body><h1>{{.}}</h1>\n<ul>")
	line, _ := template.New("line").Parse("<li><a href=\"{{.}}\">{{.}}</a></li>\n")
	if err != nil {
		return fmt.Errorf("%v", err)
	}
	tail := "</ul></body></html>\n"

	head.Execute(htmlFh, c.timestamp)
	for _, f := range c.manifest {
		line.Execute(htmlFh, f)
	}
	htmlFh.WriteString(tail)

	return nil
}

// trap captures signals and cancels the capture context so collection shuts down cleanly
func (c *OperatorDebugCommand) trap() {
	sigCh := make(chan os.Signal, 1)
	signal.Notify(sigCh,
		syscall.SIGHUP,
		syscall.SIGINT,
		syscall.SIGTERM,
		syscall.SIGQUIT)
	go func() {
		<-sigCh
		c.cancel()
	}()
}

// TarCZF, like the tar command, recursively builds a gzip compressed tar archive from a
// directory. If not empty, all files in the bundle are prefixed with the target path.
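// For example, Run calls TarCZF(stamped+".tar.gz", tmp, stamped), producing an archive
// whose entries are all prefixed with the nomad-debug-<timestamp> bundle name.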
func TarCZF(archive string, src, target string) error {
	// ensure the src actually exists before trying to tar it
	if _, err := os.Stat(src); err != nil {
		return fmt.Errorf("Unable to tar files - %v", err.Error())
	}

	// create the archive
	fh, err := os.Create(archive)
	if err != nil {
		return err
	}
	defer fh.Close()

	zz := gzip.NewWriter(fh)
	defer zz.Close()

	tw := tar.NewWriter(zz)
	defer tw.Close()

	// tar
	return filepath.Walk(src, func(file string, fi os.FileInfo, err error) error {
		// return on any error
		if err != nil {
			return err
		}

		if !fi.Mode().IsRegular() {
			return nil
		}

		header, err := tar.FileInfoHeader(fi, fi.Name())
		if err != nil {
			return err
		}

		// remove leading path to the src, so files are relative to the archive
		path := strings.Replace(file, src, "", -1)
		if target != "" {
			path = filepath.Join([]string{target, path}...)
		}
		path = strings.TrimPrefix(path, string(filepath.Separator))

		header.Name = path

		if err := tw.WriteHeader(header); err != nil {
			return err
		}

		// copy the file contents
		f, err := os.Open(file)
		if err != nil {
			return err
		}

		if _, err := io.Copy(tw, f); err != nil {
			return err
		}

		f.Close()

		return nil
	})
}

// argNodes splits node ids from the command line by ","
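// For example, an input of "a4c2, ,b7f1" yields []string{"a4c2", "b7f1"}: entries
// are trimmed and empty entries are dropped.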
func argNodes(input string) []string {
	ns := strings.Split(input, ",")
	var out []string
	for _, n := range ns {
		s := strings.TrimSpace(n)
		if s == "" {
			continue
		}
		out = append(out, s)
	}
	return out
}

// external holds address configuration for Consul and Vault APIs
type external struct {
	tls       *api.TLSConfig
	addrVal   string
	auth      string
	ssl       bool
	tokenVal  string
	tokenFile string
}
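
// addr returns defaultAddr when no address flag was set; otherwise it returns the
// flag value with its scheme normalized to match the configured ssl setting.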
func (e *external) addr(defaultAddr string) string {
	if e.addrVal == "" {
		return defaultAddr
	}

	if !e.ssl {
		if strings.HasPrefix(e.addrVal, "http:") {
			return e.addrVal
		}
		if strings.HasPrefix(e.addrVal, "https:") {
			// Mismatch: e.ssl=false but addrVal is https
			return strings.ReplaceAll(e.addrVal, "https://", "http://")
		}
		return "http://" + e.addrVal
	}

	if strings.HasPrefix(e.addrVal, "https:") {
		return e.addrVal
	}

	if strings.HasPrefix(e.addrVal, "http:") {
		// Mismatch: e.ssl=true but addrVal is http
		return strings.ReplaceAll(e.addrVal, "http://", "https://")
	}

	return "https://" + e.addrVal
}
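
// token returns the token flag value when set, otherwise the trimmed contents of the
// token file, or the empty string if neither is available.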
func (e *external) token() string {
	if e.tokenVal != "" {
		return e.tokenVal
	}

	if e.tokenFile != "" {
		bs, err := ioutil.ReadFile(e.tokenFile)
		if err == nil {
			return strings.TrimSpace(string(bs))
		}
	}

	return ""
}

// defaultHttpClient configures a basic httpClient
func defaultHttpClient() *http.Client {
	httpClient := cleanhttp.DefaultClient()
	transport := httpClient.Transport.(*http.Transport)
	transport.TLSHandshakeTimeout = 10 * time.Second
	transport.TLSClientConfig = &tls.Config{
		MinVersion: tls.VersionTLS12,
	}

	return httpClient
}