2020-06-25 16:51:23 +00:00
package command
import (
"archive/tar"
"compress/gzip"
"context"
2020-08-11 17:14:28 +00:00
"crypto/tls"
2020-06-25 16:51:23 +00:00
"encoding/json"
2022-01-18 02:35:51 +00:00
"errors"
2021-11-05 23:43:10 +00:00
"flag"
2020-06-25 16:51:23 +00:00
"fmt"
"html/template"
"io"
"net/http"
"os"
"os/signal"
"path/filepath"
2020-08-11 17:14:28 +00:00
"strconv"
2020-06-25 16:51:23 +00:00
"strings"
"syscall"
"time"
2020-08-11 17:14:28 +00:00
"github.com/hashicorp/go-cleanhttp"
2022-01-18 02:35:51 +00:00
"github.com/hashicorp/go-multierror"
2022-04-28 17:18:55 +00:00
goversion "github.com/hashicorp/go-version"
2020-06-25 16:51:23 +00:00
"github.com/hashicorp/nomad/api"
2021-10-13 00:01:54 +00:00
"github.com/hashicorp/nomad/api/contexts"
2020-12-01 17:36:05 +00:00
"github.com/hashicorp/nomad/helper"
2022-08-24 14:56:42 +00:00
"github.com/hashicorp/nomad/helper/escapingfs"
2021-11-05 23:43:10 +00:00
"github.com/hashicorp/nomad/version"
2020-06-25 16:51:23 +00:00
"github.com/posener/complete"
2022-09-21 19:53:25 +00:00
"golang.org/x/exp/maps"
"golang.org/x/exp/slices"
2020-06-25 16:51:23 +00:00
)
2020-08-11 19:39:44 +00:00
type OperatorDebugCommand struct {
2020-06-25 16:51:23 +00:00
Meta
2020-12-01 17:36:05 +00:00
timestamp string
collectDir string
duration time . Duration
interval time . Duration
2022-04-04 19:24:12 +00:00
pprofInterval time . Duration
2020-12-01 17:36:05 +00:00
pprofDuration time . Duration
logLevel string
maxNodes int
nodeClass string
nodeIDs [ ] string
serverIDs [ ] string
2022-01-18 02:35:51 +00:00
topics map [ api . Topic ] [ ] string
index uint64
2020-12-01 17:36:05 +00:00
consul * external
vault * external
manifest [ ] string
ctx context . Context
cancel context . CancelFunc
2021-12-15 15:44:03 +00:00
opts * api . QueryOptions
2022-01-18 02:35:51 +00:00
verbose bool
2022-04-28 17:18:55 +00:00
members * api . ServerMembers
nodes [ ] * api . NodeListStub
2020-06-25 16:51:23 +00:00
}
const (
	// userAgent identifies this command.
	// NOTE(review): presumably sent as the HTTP User-Agent on outgoing
	// requests - confirm at the use site.
	userAgent = "nomad operator debug"

	// Directory names used by the capture.
	// NOTE(review): presumably subdirectories of the output directory -
	// verify against the code that writes the files.
	clusterDir  = "cluster"
	clientDir   = "client"
	serverDir   = "server"
	intervalDir = "interval"

	// minimumVersionPprofConstraint is a version constraint expression.
	// NOTE(review): despite the "minimum" in its name, the expression is a
	// closed range that matches only versions 0.11.0 through 0.11.2.
	minimumVersionPprofConstraint = ">= 0.11.0, <= 0.11.2"
)
2020-08-11 19:39:44 +00:00
func ( c * OperatorDebugCommand ) Help ( ) string {
2020-06-25 16:51:23 +00:00
helpText := `
2020-08-11 19:39:44 +00:00
Usage : nomad operator debug [ options ]
2020-06-25 16:51:23 +00:00
2021-03-09 13:31:38 +00:00
Build an archive containing Nomad cluster configuration and state , and Consul
and Vault status . Include logs and pprof profiles for selected servers and
client nodes .
2020-06-25 16:51:23 +00:00
2020-11-19 21:38:08 +00:00
If ACLs are enabled , this command will require a token with the ' node : read '
capability to run . In order to collect information , the token will also
require the ' agent : read ' and ' operator : read ' capabilities , as well as the
2020-12-01 17:36:05 +00:00
' list - jobs ' capability for all namespaces . To collect pprof profiles the
2021-03-09 13:31:38 +00:00
token will also require ' agent : write ' , or enable_debug configuration set to
true .
2020-11-19 21:38:08 +00:00
2022-01-18 02:35:51 +00:00
If event stream capture is enabled , the Job , Allocation , Deployment ,
and Evaluation topics require ' namespace : read - job ' capabilities , the Node
topic requires ' node : read ' . A ' management ' token is required to capture
ACLToken , ACLPolicy , or all all events .
2020-06-25 16:51:23 +00:00
General Options :
2020-11-19 16:15:23 +00:00
` + generalOptionsUsage(usageOptsDefault|usageOptsNoNamespace) + `
2020-06-25 16:51:23 +00:00
2021-03-09 13:31:38 +00:00
Consul Options :
2020-06-25 16:51:23 +00:00
2020-08-11 17:14:28 +00:00
- consul - http - addr = < addr >
2021-03-09 13:31:38 +00:00
The address and port of the Consul HTTP agent . Overrides the
CONSUL_HTTP_ADDR environment variable .
2020-08-11 17:14:28 +00:00
- consul - token = < token >
Token used to query Consul . Overrides the CONSUL_HTTP_TOKEN environment
variable and the Consul token file .
- consul - token - file = < path >
Path to the Consul token file . Overrides the CONSUL_HTTP_TOKEN_FILE
environment variable .
- consul - client - cert = < path >
Path to the Consul client cert file . Overrides the CONSUL_CLIENT_CERT
environment variable .
- consul - client - key = < path >
Path to the Consul client key file . Overrides the CONSUL_CLIENT_KEY
environment variable .
- consul - ca - cert = < path >
Path to a CA file to use with Consul . Overrides the CONSUL_CACERT
environment variable and the Consul CA path .
- consul - ca - path = < path >
Path to a directory of PEM encoded CA cert files to verify the Consul
certificate . Overrides the CONSUL_CAPATH environment variable .
2021-03-09 13:31:38 +00:00
Vault Options :
2020-08-11 17:14:28 +00:00
- vault - address = < addr >
The address and port of the Vault HTTP agent . Overrides the VAULT_ADDR
environment variable .
- vault - token = < token >
Token used to query Vault . Overrides the VAULT_TOKEN environment
variable .
- vault - client - cert = < path >
Path to the Vault client cert file . Overrides the VAULT_CLIENT_CERT
environment variable .
- vault - client - key = < path >
Path to the Vault client key file . Overrides the VAULT_CLIENT_KEY
environment variable .
- vault - ca - cert = < path >
Path to a CA file to use with Vault . Overrides the VAULT_CACERT
environment variable and the Vault CA path .
- vault - ca - path = < path >
Path to a directory of PEM encoded CA cert files to verify the Vault
certificate . Overrides the VAULT_CAPATH environment variable .
2021-03-09 13:31:38 +00:00
Debug Options :
- duration = < duration >
2021-10-27 17:43:56 +00:00
Set the duration of the debug capture . Logs will be captured from specified servers and
2022-01-18 02:35:51 +00:00
nodes at "log-level" . Defaults to 2 m .
- event - index = < index >
Specifies the index to start streaming events from . If the requested index is
no longer in the buffer the stream will start at the next available index .
Defaults to 0.
- event - topic = < Allocation , Evaluation , Job , Node , * > : < filter >
Enable event stream capture , filtered by comma delimited list of topic filters .
Examples :
"all" or "*:*" for all events
"Evaluation" or "Evaluation:*" for all evaluation events
"*:example" for all events related to the job "example"
Defaults to "none" ( disabled ) .
2021-03-09 13:31:38 +00:00
- interval = < interval >
2021-12-15 15:44:03 +00:00
The interval between snapshots of the Nomad state . Set interval equal to
2021-03-09 13:31:38 +00:00
duration to capture a single snapshot . Defaults to 30 s .
- log - level = < level >
The log level to monitor . Defaults to DEBUG .
- max - nodes = < count >
Cap the maximum number of client nodes included in the capture . Defaults
to 10 , set to 0 for unlimited .
2021-10-27 17:43:56 +00:00
- node - id = < node1 > , < node2 >
2021-03-09 13:31:38 +00:00
Comma separated list of Nomad client node ids to monitor for logs , API
outputs , and pprof profiles . Accepts id prefixes , and "all" to select all
nodes ( up to count = max - nodes ) . Defaults to "all" .
- node - class = < node - class >
Filter client nodes based on node class .
- pprof - duration = < duration >
Fix flaky `operator debug` test (#12501)
We introduced a `pprof-interval` argument to `operator debug` in #11938, and unfortunately this has resulted in a lot of test flakes. The actual command in use is mostly fine (although I've fixed some quirks here), so what's really happened is that the change has revealed some existing issues in the tests. Summary of changes:
* Make first pprof collection synchronous to preserve the existing
behavior for the common case where the pprof interval matches the
duration.
* Clamp `operator debug` pprof timing to that of the command. The
`pprof-duration` should be no more than `duration` and the
`pprof-interval` should be no more than `pprof-duration`. Clamp the
values rather than throwing errors, which could change the commands
that existing users might already have in debugging scripts
* Testing: remove test parallelism
The `operator debug` tests that stand up servers can't be run in
parallel, because we don't have a way of canceling the API calls for
pprof. The agent will still be running the last pprof when we exit,
and that breaks the next test that talks to that same agent.
(Because you can only run one pprof at a time on any process!)
We could split off each subtest into its own server, but this test
suite is already very slow. In future work we should fix this "for
real" by making the API call cancelable.
* Testing: assert against unexpected errors in `operator debug` tests.
If we assert there are no unexpected error outputs, it's easier for
the developer to debug when something is going wrong with the tests
because the error output will be presented as a failing test, rather
than just a failing exit code check. Or worse, no failing exit code
check!
This also forces us to be explicit about which tests will return 0
exit codes but still emit (presumably ignorable) error outputs.
Additional minor bug fixes (mostly in tests) and test refactorings:
* Fix text alignment on pprof Duration in `operator debug` output
* Remove "done" channel from `operator debug` event stream test. The
goroutine we're blocking for here already tells us it's done by
sending a value, so block on that instead of an extraneous channel
* Event stream test timer should start at current time, not zero
* Remove noise from `operator debug` test log output. The `t.Logf`
calls already are picked out from the rest of the test output by
being prefixed with the filename.
* Remove explicit pprof args so we use the defaults clamped from
duration/interval
2022-04-07 19:00:07 +00:00
Duration for pprof collection . Defaults to 1 s or - duration , whichever is less .
2021-03-09 13:31:38 +00:00
2022-04-04 19:24:12 +00:00
- pprof - interval = < pprof - interval >
The interval between pprof collections . Set interval equal to
Fix flaky `operator debug` test (#12501)
We introduced a `pprof-interval` argument to `operator debug` in #11938, and unfortunately this has resulted in a lot of test flakes. The actual command in use is mostly fine (although I've fixed some quirks here), so what's really happened is that the change has revealed some existing issues in the tests. Summary of changes:
* Make first pprof collection synchronous to preserve the existing
behavior for the common case where the pprof interval matches the
duration.
* Clamp `operator debug` pprof timing to that of the command. The
`pprof-duration` should be no more than `duration` and the
`pprof-interval` should be no more than `pprof-duration`. Clamp the
values rather than throwing errors, which could change the commands
that existing users might already have in debugging scripts
* Testing: remove test parallelism
The `operator debug` tests that stand up servers can't be run in
parallel, because we don't have a way of canceling the API calls for
pprof. The agent will still be running the last pprof when we exit,
and that breaks the next test that talks to that same agent.
(Because you can only run one pprof at a time on any process!)
We could split off each subtest into its own server, but this test
suite is already very slow. In future work we should fix this "for
real" by making the API call cancelable.
* Testing: assert against unexpected errors in `operator debug` tests.
If we assert there are no unexpected error outputs, it's easier for
the developer to debug when something is going wrong with the tests
because the error output will be presented as a failing test, rather
than just a failing exit code check. Or worse, no failing exit code
check!
This also forces us to be explicit about which tests will return 0
exit codes but still emit (presumably ignorable) error outputs.
Additional minor bug fixes (mostly in tests) and test refactorings:
* Fix text alignment on pprof Duration in `operator debug` output
* Remove "done" channel from `operator debug` event stream test. The
goroutine we're blocking for here already tells us it's done by
sending a value, so block on that instead of an extraneous channel
* Event stream test timer should start at current time, not zero
* Remove noise from `operator debug` test log output. The `t.Logf`
calls already are picked out from the rest of the test output by
being prefixed with the filename.
* Remove explicit pprof args so we use the defaults clamped from
duration/interval
2022-04-07 19:00:07 +00:00
duration to capture a single snapshot . Defaults to 250 ms or
- pprof - duration , whichever is less .
2022-04-04 19:24:12 +00:00
2021-10-27 17:43:56 +00:00
- server - id = < server1 > , < server2 >
2021-03-09 13:31:38 +00:00
Comma separated list of Nomad server names to monitor for logs , API
outputs , and pprof profiles . Accepts server names , "leader" , or "all" .
Defaults to "all" .
- stale = < true | false >
If "false" , the default , get membership data from the cluster leader . If
the cluster is in an outage unable to establish leadership , it may be
necessary to get the configuration from a non - leader server .
- output = < path >
2021-12-15 15:44:03 +00:00
Path to the parent directory of the output directory . If specified , no
2022-01-18 02:35:51 +00:00
archive is built . Defaults to the current directory .
- verbose
Enable verbose output .
2020-06-25 16:51:23 +00:00
`
return strings . TrimSpace ( helpText )
}
2020-08-11 19:39:44 +00:00
func ( c * OperatorDebugCommand ) Synopsis ( ) string {
2020-06-25 16:51:23 +00:00
return "Build a debug archive"
}
2020-08-11 19:39:44 +00:00
func ( c * OperatorDebugCommand ) AutocompleteFlags ( ) complete . Flags {
2020-06-25 16:51:23 +00:00
return mergeAutocompleteFlags ( c . Meta . AutocompleteFlags ( FlagSetClient ) ,
complete . Flags {
2020-12-01 17:36:05 +00:00
"-duration" : complete . PredictAnything ,
2022-01-18 02:35:51 +00:00
"-event-index" : complete . PredictAnything ,
"-event-topic" : complete . PredictAnything ,
2020-12-01 17:36:05 +00:00
"-interval" : complete . PredictAnything ,
2021-10-13 00:01:54 +00:00
"-log-level" : complete . PredictSet ( "TRACE" , "DEBUG" , "INFO" , "WARN" , "ERROR" ) ,
2020-12-01 17:36:05 +00:00
"-max-nodes" : complete . PredictAnything ,
2021-10-13 00:01:54 +00:00
"-node-class" : NodeClassPredictor ( c . Client ) ,
"-node-id" : NodePredictor ( c . Client ) ,
"-server-id" : ServerPredictor ( c . Client ) ,
"-output" : complete . PredictDirs ( "*" ) ,
2020-12-01 17:36:05 +00:00
"-pprof-duration" : complete . PredictAnything ,
"-consul-token" : complete . PredictAnything ,
"-vault-token" : complete . PredictAnything ,
2022-01-18 02:35:51 +00:00
"-verbose" : complete . PredictAnything ,
2020-06-25 16:51:23 +00:00
} )
}
2020-08-11 19:39:44 +00:00
func ( c * OperatorDebugCommand ) AutocompleteArgs ( ) complete . Predictor {
2020-06-25 16:51:23 +00:00
return complete . PredictNothing
}
2021-10-13 00:01:54 +00:00
// NodePredictor returns a client node predictor
func NodePredictor ( factory ApiClientFactory ) complete . Predictor {
return complete . PredictFunc ( func ( a complete . Args ) [ ] string {
client , err := factory ( )
if err != nil {
return nil
}
2021-12-15 15:44:03 +00:00
// note we can't use the -stale flag here because we're in the
// predictor, but a stale query should be safe for prediction;
// we also can't use region forwarding because we can't rely
// on the server being up
resp , _ , err := client . Search ( ) . PrefixSearch (
a . Last , contexts . Nodes , & api . QueryOptions { AllowStale : true } )
2021-10-13 00:01:54 +00:00
if err != nil {
return [ ] string { }
}
return resp . Matches [ contexts . Nodes ]
} )
}
// NodeClassPredictor returns a client node class predictor
2022-01-18 02:35:51 +00:00
// TODO dmay: Consider API options for node class filtering
2021-10-13 00:01:54 +00:00
func NodeClassPredictor ( factory ApiClientFactory ) complete . Predictor {
return complete . PredictFunc ( func ( a complete . Args ) [ ] string {
client , err := factory ( )
if err != nil {
return nil
}
2021-12-15 15:44:03 +00:00
// note we can't use the -stale flag here because we're in the
// predictor, but a stale query should be safe for prediction;
// we also can't use region forwarding because we can't rely
// on the server being up
nodes , _ , err := client . Nodes ( ) . List ( & api . QueryOptions { AllowStale : true } )
2021-10-13 00:01:54 +00:00
if err != nil {
return [ ] string { }
}
// Build map of unique node classes across all nodes
classes := make ( map [ string ] bool )
for _ , node := range nodes {
classes [ node . NodeClass ] = true
}
// Iterate over node classes looking for match
filtered := [ ] string { }
for class := range classes {
if strings . HasPrefix ( class , a . Last ) {
filtered = append ( filtered , class )
}
}
return filtered
} )
}
// ServerPredictor returns a server member predictor
2022-01-18 02:35:51 +00:00
// TODO dmay: Consider API options for server member filtering
2021-10-13 00:01:54 +00:00
func ServerPredictor ( factory ApiClientFactory ) complete . Predictor {
return complete . PredictFunc ( func ( a complete . Args ) [ ] string {
client , err := factory ( )
if err != nil {
return nil
}
2021-12-15 15:44:03 +00:00
// note we can't use the -stale flag here because we're in the
// predictor, but a stale query should be safe for prediction;
// we also can't use region forwarding because we can't rely
// on the server being up
members , err := client . Agent ( ) . MembersOpts ( & api . QueryOptions { AllowStale : true } )
2021-10-13 00:01:54 +00:00
if err != nil {
return [ ] string { }
}
// Iterate over server members looking for match
filtered := [ ] string { }
for _ , member := range members . Members {
if strings . HasPrefix ( member . Name , a . Last ) {
filtered = append ( filtered , member . Name )
}
}
return filtered
} )
}
2021-12-15 15:44:03 +00:00
// queryOpts returns a copy of the shared api.QueryOptions so
// that api package methods can safely modify the options
func ( c * OperatorDebugCommand ) queryOpts ( ) * api . QueryOptions {
qo := new ( api . QueryOptions )
* qo = * c . opts
2022-09-21 19:53:25 +00:00
qo . Params = maps . Clone ( c . opts . Params )
2021-12-15 15:44:03 +00:00
return qo
}
2020-08-11 19:39:44 +00:00
// Name returns the name of this command, "debug". Run uses it to name the
// command's flag set.
func (c *OperatorDebugCommand) Name() string {
	return "debug"
}
2020-06-25 16:51:23 +00:00
2020-08-11 19:39:44 +00:00
func ( c * OperatorDebugCommand ) Run ( args [ ] string ) int {
2020-06-25 16:51:23 +00:00
flags := c . Meta . FlagSet ( c . Name ( ) , FlagSetClient )
flags . Usage = func ( ) { c . Ui . Output ( c . Help ( ) ) }
2022-04-04 19:24:12 +00:00
var duration , interval , pprofInterval , output , pprofDuration , eventTopic string
2022-01-18 02:35:51 +00:00
var eventIndex int64
2020-06-25 16:51:23 +00:00
var nodeIDs , serverIDs string
2021-12-15 15:44:03 +00:00
var allowStale bool
2020-06-25 16:51:23 +00:00
flags . StringVar ( & duration , "duration" , "2m" , "" )
2022-01-18 02:35:51 +00:00
flags . Int64Var ( & eventIndex , "event-index" , 0 , "" )
flags . StringVar ( & eventTopic , "event-topic" , "none" , "" )
2021-03-09 13:31:38 +00:00
flags . StringVar ( & interval , "interval" , "30s" , "" )
2020-06-25 16:51:23 +00:00
flags . StringVar ( & c . logLevel , "log-level" , "DEBUG" , "" )
2020-11-12 16:25:28 +00:00
flags . IntVar ( & c . maxNodes , "max-nodes" , 10 , "" )
flags . StringVar ( & c . nodeClass , "node-class" , "" , "" )
2021-10-27 17:43:56 +00:00
flags . StringVar ( & nodeIDs , "node-id" , "all" , "" )
2021-03-09 13:31:38 +00:00
flags . StringVar ( & serverIDs , "server-id" , "all" , "" )
2021-12-15 15:44:03 +00:00
flags . BoolVar ( & allowStale , "stale" , false , "" )
2020-06-25 16:51:23 +00:00
flags . StringVar ( & output , "output" , "" , "" )
2020-12-01 17:36:05 +00:00
flags . StringVar ( & pprofDuration , "pprof-duration" , "1s" , "" )
2022-04-04 19:24:12 +00:00
flags . StringVar ( & pprofInterval , "pprof-interval" , "250ms" , "" )
2022-01-18 02:35:51 +00:00
flags . BoolVar ( & c . verbose , "verbose" , false , "" )
2020-08-11 17:14:28 +00:00
c . consul = & external { tls : & api . TLSConfig { } }
flags . StringVar ( & c . consul . addrVal , "consul-http-addr" , os . Getenv ( "CONSUL_HTTP_ADDR" ) , "" )
ssl := os . Getenv ( "CONSUL_HTTP_SSL" )
c . consul . ssl , _ = strconv . ParseBool ( ssl )
flags . StringVar ( & c . consul . auth , "consul-auth" , os . Getenv ( "CONSUL_HTTP_AUTH" ) , "" )
flags . StringVar ( & c . consul . tokenVal , "consul-token" , os . Getenv ( "CONSUL_HTTP_TOKEN" ) , "" )
flags . StringVar ( & c . consul . tokenFile , "consul-token-file" , os . Getenv ( "CONSUL_HTTP_TOKEN_FILE" ) , "" )
flags . StringVar ( & c . consul . tls . ClientCert , "consul-client-cert" , os . Getenv ( "CONSUL_CLIENT_CERT" ) , "" )
flags . StringVar ( & c . consul . tls . ClientKey , "consul-client-key" , os . Getenv ( "CONSUL_CLIENT_KEY" ) , "" )
flags . StringVar ( & c . consul . tls . CACert , "consul-ca-cert" , os . Getenv ( "CONSUL_CACERT" ) , "" )
flags . StringVar ( & c . consul . tls . CAPath , "consul-ca-path" , os . Getenv ( "CONSUL_CAPATH" ) , "" )
c . vault = & external { tls : & api . TLSConfig { } }
flags . StringVar ( & c . vault . addrVal , "vault-address" , os . Getenv ( "VAULT_ADDR" ) , "" )
flags . StringVar ( & c . vault . tokenVal , "vault-token" , os . Getenv ( "VAULT_TOKEN" ) , "" )
flags . StringVar ( & c . vault . tls . CACert , "vault-ca-cert" , os . Getenv ( "VAULT_CACERT" ) , "" )
flags . StringVar ( & c . vault . tls . CAPath , "vault-ca-path" , os . Getenv ( "VAULT_CAPATH" ) , "" )
flags . StringVar ( & c . vault . tls . ClientCert , "vault-client-cert" , os . Getenv ( "VAULT_CLIENT_CERT" ) , "" )
flags . StringVar ( & c . vault . tls . ClientKey , "vault-client-key" , os . Getenv ( "VAULT_CLIENT_KEY" ) , "" )
2020-06-25 16:51:23 +00:00
if err := flags . Parse ( args ) ; err != nil {
2020-10-06 02:30:01 +00:00
c . Ui . Error ( fmt . Sprintf ( "Error parsing arguments: %q" , err ) )
2020-06-25 16:51:23 +00:00
return 1
}
2020-11-12 16:25:28 +00:00
// Parse the capture duration
2020-06-25 16:51:23 +00:00
d , err := time . ParseDuration ( duration )
if err != nil {
c . Ui . Error ( fmt . Sprintf ( "Error parsing duration: %s: %s" , duration , err . Error ( ) ) )
return 1
}
c . duration = d
2020-11-12 16:25:28 +00:00
// Parse the capture interval
2020-06-25 16:51:23 +00:00
i , err := time . ParseDuration ( interval )
if err != nil {
c . Ui . Error ( fmt . Sprintf ( "Error parsing interval: %s: %s" , interval , err . Error ( ) ) )
return 1
}
c . interval = i
2021-03-09 13:31:38 +00:00
// Validate interval
if i . Seconds ( ) > d . Seconds ( ) {
c . Ui . Error ( fmt . Sprintf ( "Error parsing interval: %s is greater than duration %s" , interval , duration ) )
return 1
}
Fix flaky `operator debug` test (#12501)
We introduced a `pprof-interval` argument to `operator debug` in #11938, and unfortunately this has resulted in a lot of test flakes. The actual command in use is mostly fine (although I've fixed some quirks here), so what's really happened is that the change has revealed some existing issues in the tests. Summary of changes:
* Make first pprof collection synchronous to preserve the existing
behavior for the common case where the pprof interval matches the
duration.
* Clamp `operator debug` pprof timing to that of the command. The
`pprof-duration` should be no more than `duration` and the
`pprof-interval` should be no more than `pprof-duration`. Clamp the
values rather than throwing errors, which could change the commands
that existing users might already have in debugging scripts
* Testing: remove test parallelism
The `operator debug` tests that stand up servers can't be run in
parallel, because we don't have a way of canceling the API calls for
pprof. The agent will still be running the last pprof when we exit,
and that breaks the next test that talks to that same agent.
(Because you can only run one pprof at a time on any process!)
We could split off each subtest into its own server, but this test
suite is already very slow. In future work we should fix this "for
real" by making the API call cancelable.
* Testing: assert against unexpected errors in `operator debug` tests.
If we assert there are no unexpected error outputs, it's easier for
the developer to debug when something is going wrong with the tests
because the error output will be presented as a failing test, rather
than just a failing exit code check. Or worse, no failing exit code
check!
This also forces us to be explicit about which tests will return 0
exit codes but still emit (presumably ignorable) error outputs.
Additional minor bug fixes (mostly in tests) and test refactorings:
* Fix text alignment on pprof Duration in `operator debug` output
* Remove "done" channel from `operator debug` event stream test. The
goroutine we're blocking for here already tells us it's done by
sending a value, so block on that instead of an extraneous channel
* Event stream test timer should start at current time, not zero
* Remove noise from `operator debug` test log output. The `t.Logf`
calls already are picked out from the rest of the test output by
being prefixed with the filename.
* Remove explicit pprof args so we use the defaults clamped from
duration/interval
2022-04-07 19:00:07 +00:00
// Parse and clamp the pprof capture duration
2020-12-01 17:36:05 +00:00
pd , err := time . ParseDuration ( pprofDuration )
if err != nil {
c . Ui . Error ( fmt . Sprintf ( "Error parsing pprof duration: %s: %s" , pprofDuration , err . Error ( ) ) )
return 1
}
Fix flaky `operator debug` test (#12501)
We introduced a `pprof-interval` argument to `operator debug` in #11938, and unfortunately this has resulted in a lot of test flakes. The actual command in use is mostly fine (although I've fixed some quirks here), so what's really happened is that the change has revealed some existing issues in the tests. Summary of changes:
* Make first pprof collection synchronous to preserve the existing
behavior for the common case where the pprof interval matches the
duration.
* Clamp `operator debug` pprof timing to that of the command. The
`pprof-duration` should be no more than `duration` and the
`pprof-interval` should be no more than `pprof-duration`. Clamp the
values rather than throwing errors, which could change the commands
that existing users might already have in debugging scripts
* Testing: remove test parallelism
The `operator debug` tests that stand up servers can't be run in
parallel, because we don't have a way of canceling the API calls for
pprof. The agent will still be running the last pprof when we exit,
and that breaks the next test that talks to that same agent.
(Because you can only run one pprof at a time on any process!)
We could split off each subtest into its own server, but this test
suite is already very slow. In future work we should fix this "for
real" by making the API call cancelable.
* Testing: assert against unexpected errors in `operator debug` tests.
If we assert there are no unexpected error outputs, it's easier for
the developer to debug when something is going wrong with the tests
because the error output will be presented as a failing test, rather
than just a failing exit code check. Or worse, no failing exit code
check!
This also forces us to be explicit about which tests will return 0
exit codes but still emit (presumably ignorable) error outputs.
Additional minor bug fixes (mostly in tests) and test refactorings:
* Fix text alignment on pprof Duration in `operator debug` output
* Remove "done" channel from `operator debug` event stream test. The
goroutine we're blocking for here already tells us it's done by
sending a value, so block on that instead of an extraneous channel
* Event stream test timer should start at current time, not zero
* Remove noise from `operator debug` test log output. The `t.Logf`
calls already are picked out from the rest of the test output by
being prefixed with the filename.
* Remove explicit pprof args so we use the defaults clamped from
duration/interval
2022-04-07 19:00:07 +00:00
if pd . Seconds ( ) > d . Seconds ( ) {
pd = d
}
2020-12-01 17:36:05 +00:00
c . pprofDuration = pd
Fix flaky `operator debug` test (#12501)
We introduced a `pprof-interval` argument to `operator debug` in #11938, and unfortunately this has resulted in a lot of test flakes. The actual command in use is mostly fine (although I've fixed some quirks here), so what's really happened is that the change has revealed some existing issues in the tests. Summary of changes:
* Make first pprof collection synchronous to preserve the existing
behavior for the common case where the pprof interval matches the
duration.
* Clamp `operator debug` pprof timing to that of the command. The
`pprof-duration` should be no more than `duration` and the
`pprof-interval` should be no more than `pprof-duration`. Clamp the
values rather than throwing errors, which could change the commands
that existing users might already have in debugging scripts
* Testing: remove test parallelism
The `operator debug` tests that stand up servers can't be run in
parallel, because we don't have a way of canceling the API calls for
pprof. The agent will still be running the last pprof when we exit,
and that breaks the next test that talks to that same agent.
(Because you can only run one pprof at a time on any process!)
We could split off each subtest into its own server, but this test
suite is already very slow. In future work we should fix this "for
real" by making the API call cancelable.
* Testing: assert against unexpected errors in `operator debug` tests.
If we assert there are no unexpected error outputs, it's easier for
the developer to debug when something is going wrong with the tests
because the error output will be presented as a failing test, rather
than just a failing exit code check. Or worse, no failing exit code
check!
This also forces us to be explicit about which tests will return 0
exit codes but still emit (presumably ignorable) error outputs.
Additional minor bug fixes (mostly in tests) and test refactorings:
* Fix text alignment on pprof Duration in `operator debug` output
* Remove "done" channel from `operator debug` event stream test. The
goroutine we're blocking for here already tells us it's done by
sending a value, so block on that instead of an extraneous channel
* Event stream test timer should start at current time, not zero
* Remove noise from `operator debug` test log output. The `t.Logf`
calls already are picked out from the rest of the test output by
being prefixed with the filename.
* Remove explicit pprof args so we use the defaults clamped from
duration/interval
2022-04-07 19:00:07 +00:00
// Parse and clamp the pprof capture interval
pi , err := time . ParseDuration ( pprofInterval )
if err != nil {
c . Ui . Error ( fmt . Sprintf ( "Error parsing pprof-interval: %s: %s" , pprofInterval , err . Error ( ) ) )
2022-04-04 19:24:12 +00:00
return 1
}
Fix flaky `operator debug` test (#12501)
We introduced a `pprof-interval` argument to `operator debug` in #11938, and unfortunately this has resulted in a lot of test flakes. The actual command in use is mostly fine (although I've fixed some quirks here), so what's really happened is that the change has revealed some existing issues in the tests. Summary of changes:
* Make first pprof collection synchronous to preserve the existing
behavior for the common case where the pprof interval matches the
duration.
* Clamp `operator debug` pprof timing to that of the command. The
`pprof-duration` should be no more than `duration` and the
`pprof-interval` should be no more than `pprof-duration`. Clamp the
values rather than throwing errors, which could change the commands
that existing users might already have in debugging scripts
* Testing: remove test parallelism
The `operator debug` tests that stand up servers can't be run in
parallel, because we don't have a way of canceling the API calls for
pprof. The agent will still be running the last pprof when we exit,
and that breaks the next test that talks to that same agent.
(Because you can only run one pprof at a time on any process!)
We could split off each subtest into its own server, but this test
suite is already very slow. In future work we should fix this "for
real" by making the API call cancelable.
* Testing: assert against unexpected errors in `operator debug` tests.
If we assert there are no unexpected error outputs, it's easier for
the developer to debug when something is going wrong with the tests
because the error output will be presented as a failing test, rather
than just a failing exit code check. Or worse, no failing exit code
check!
This also forces us to be explicit about which tests will return 0
exit codes but still emit (presumably ignorable) error outputs.
Additional minor bug fixes (mostly in tests) and test refactorings:
* Fix text alignment on pprof Duration in `operator debug` output
* Remove "done" channel from `operator debug` event stream test. The
goroutine we're blocking for here already tells us it's done by
sending a value, so block on that instead of an extraneous channel
* Event stream test timer should start at current time, not zero
* Remove noise from `operator debug` test log output. The `t.Logf`
calls already are picked out from the rest of the test output by
being prefixed with the filename.
* Remove explicit pprof args so we use the defaults clamped from
duration/interval
2022-04-07 19:00:07 +00:00
if pi . Seconds ( ) > pd . Seconds ( ) {
pi = pd
}
c . pprofInterval = pi
2022-04-04 19:24:12 +00:00
2022-01-18 02:35:51 +00:00
// Parse event stream topic filter
t , err := topicsFromString ( eventTopic )
if err != nil {
c . Ui . Error ( fmt . Sprintf ( "Error parsing event topics: %v" , err ) )
return 1
}
c . topics = t
// Validate and set initial event stream index
if eventIndex < 0 {
c . Ui . Error ( "Event stream index must be greater than zero" )
return 1
}
c . index = uint64 ( eventIndex )
2020-11-12 16:25:28 +00:00
// Verify there are no extra arguments
2020-06-25 16:51:23 +00:00
args = flags . Args ( )
if l := len ( args ) ; l != 0 {
c . Ui . Error ( "This command takes no arguments" )
c . Ui . Error ( commandErrorText ( c ) )
return 1
}
2020-11-12 16:25:28 +00:00
// Initialize capture variables and structs
c . manifest = make ( [ ] string , 0 )
ctx , cancel := context . WithCancel ( context . Background ( ) )
c . ctx = ctx
c . cancel = cancel
c . trap ( )
// Generate timestamped file name
format := "2006-01-02-150405Z"
c . timestamp = time . Now ( ) . UTC ( ) . Format ( format )
stamped := "nomad-debug-" + c . timestamp
// Create the output directory
var tmp string
if output != "" {
// User specified output directory
tmp = filepath . Join ( output , stamped )
_ , err := os . Stat ( tmp )
if ! os . IsNotExist ( err ) {
c . Ui . Error ( "Output directory already exists" )
return 2
}
} else {
// Generate temp directory
2023-03-08 15:20:04 +00:00
tmp , err = os . MkdirTemp ( os . TempDir ( ) , stamped )
2020-11-12 16:25:28 +00:00
if err != nil {
c . Ui . Error ( fmt . Sprintf ( "Error creating tmp directory: %s" , err . Error ( ) ) )
return 2
}
defer os . RemoveAll ( tmp )
}
c . collectDir = tmp
2021-11-05 23:43:10 +00:00
// Write CLI flags to JSON file
c . writeFlags ( flags )
2020-11-12 16:25:28 +00:00
// Create an instance of the API client
2020-06-25 16:51:23 +00:00
client , err := c . Meta . Client ( )
if err != nil {
c . Ui . Error ( fmt . Sprintf ( "Error initializing client: %s" , err . Error ( ) ) )
return 1
}
2021-12-15 15:44:03 +00:00
c . opts = & api . QueryOptions {
Region : c . Meta . region ,
AllowStale : allowStale ,
AuthToken : c . Meta . token ,
}
2022-04-28 17:18:55 +00:00
// Get complete list of client nodes
c . nodes , _ , err = client . Nodes ( ) . List ( c . queryOpts ( ) )
if err != nil {
c . Ui . Error ( fmt . Sprintf ( "Error querying node info: %v" , err ) )
return 1
}
// Write nodes to file
2022-09-22 14:02:00 +00:00
c . reportErr ( writeResponseToFile ( c . nodes , c . newFile ( clusterDir , "nodes.json" ) ) )
2022-04-28 17:18:55 +00:00
2020-11-12 16:25:28 +00:00
// Search all nodes if a node class is specified without a list of node ID prefixes
if c . nodeClass != "" && nodeIDs == "" {
nodeIDs = "all"
}
// Resolve client node id prefixes
nodesFound := 0
nodeLookupFailCount := 0
nodeCaptureCount := 0
2021-10-12 20:58:41 +00:00
for _ , id := range stringToSlice ( nodeIDs ) {
2020-11-12 16:25:28 +00:00
if id == "all" {
// Capture from all nodes using empty prefix filter
id = ""
} else {
// Capture from nodes starting with prefix id
id = sanitizeUUIDPrefix ( id )
}
2021-12-15 15:44:03 +00:00
nodes , _ , err := client . Nodes ( ) . PrefixListOpts ( id , c . queryOpts ( ) )
2020-06-25 16:51:23 +00:00
if err != nil {
c . Ui . Error ( fmt . Sprintf ( "Error querying node info: %s" , err ) )
return 1
}
2020-11-12 16:25:28 +00:00
// Increment fail count if no nodes are found
2020-12-14 20:02:48 +00:00
if len ( nodes ) == 0 {
2020-06-25 16:51:23 +00:00
c . Ui . Error ( fmt . Sprintf ( "No node(s) with prefix %q found" , id ) )
2020-11-12 16:25:28 +00:00
nodeLookupFailCount ++
continue
2020-06-25 16:51:23 +00:00
}
2020-12-14 20:02:48 +00:00
nodesFound += len ( nodes )
2020-11-12 16:25:28 +00:00
// Apply constraints to nodes found
2020-06-25 16:51:23 +00:00
for _ , n := range nodes {
2020-11-12 16:25:28 +00:00
// Ignore nodes that do not match specified class
if c . nodeClass != "" && n . NodeClass != c . nodeClass {
continue
}
// Add node to capture list
2020-06-25 16:51:23 +00:00
c . nodeIDs = append ( c . nodeIDs , n . ID )
2020-11-12 16:25:28 +00:00
nodeCaptureCount ++
// Stop looping when we reach the max
if c . maxNodes != 0 && nodeCaptureCount >= c . maxNodes {
break
}
2020-06-25 16:51:23 +00:00
}
}
2020-11-12 16:25:28 +00:00
// Return error if nodes were specified but none were found
if len ( nodeIDs ) > 0 && nodeCaptureCount == 0 {
2021-10-27 17:43:56 +00:00
if nodeIDs == "all" {
// It's okay to have zero clients for default "all"
c . Ui . Info ( "Note: \"-node-id=all\" specified but no clients found" )
} else {
c . Ui . Error ( fmt . Sprintf ( "Failed to retrieve clients, 0 nodes found in list: %s" , nodeIDs ) )
return 1
}
2020-11-12 16:25:28 +00:00
}
2020-10-14 19:16:10 +00:00
// Resolve servers
2022-04-28 17:18:55 +00:00
c . members , err = client . Agent ( ) . MembersOpts ( c . queryOpts ( ) )
2020-11-12 16:25:28 +00:00
if err != nil {
c . Ui . Error ( fmt . Sprintf ( "Failed to retrieve server list; err: %v" , err ) )
return 1
}
2021-10-12 20:58:41 +00:00
// Write complete list of server members to file
2022-09-22 14:02:00 +00:00
c . reportErr ( writeResponseToFile ( c . members , c . newFile ( clusterDir , "members.json" ) ) )
2021-10-12 20:58:41 +00:00
2022-07-06 14:57:44 +00:00
// Get leader and write to file; there's no option for AllowStale
// on this API and a stale result wouldn't even be meaningful, so
// only warn if we fail so that we don't stop the rest of the
// debugging
leader , err := client . Status ( ) . Leader ( )
if err != nil {
c . Ui . Warn ( fmt . Sprintf ( "Failed to retrieve leader; err: %v" , err ) )
}
if len ( leader ) > 0 {
2022-09-22 14:02:00 +00:00
c . reportErr ( writeResponseToFile ( leader , c . newFile ( clusterDir , "leader.json" ) ) )
2022-07-06 14:57:44 +00:00
}
2021-10-12 20:58:41 +00:00
// Filter for servers matching criteria
2022-04-28 17:18:55 +00:00
c . serverIDs , err = filterServerMembers ( c . members , serverIDs , c . region )
2021-10-12 20:58:41 +00:00
if err != nil {
c . Ui . Error ( fmt . Sprintf ( "Failed to parse server list; err: %v" , err ) )
return 1
2020-10-14 19:16:10 +00:00
}
2020-11-12 16:25:28 +00:00
serversFound := 0
serverCaptureCount := 0
2022-04-28 17:18:55 +00:00
if c . members != nil {
serversFound = len ( c . members . Members )
2020-11-12 16:25:28 +00:00
}
if c . serverIDs != nil {
serverCaptureCount = len ( c . serverIDs )
}
2020-10-14 19:16:10 +00:00
// Return error if servers were specified but not found
2020-11-12 16:25:28 +00:00
if len ( serverIDs ) > 0 && serverCaptureCount == 0 {
2020-10-14 19:16:10 +00:00
c . Ui . Error ( fmt . Sprintf ( "Failed to retrieve servers, 0 members found in list: %s" , serverIDs ) )
return 1
2020-06-25 16:51:23 +00:00
}
2020-11-12 16:25:28 +00:00
// Display general info about the capture
c . Ui . Output ( "Starting debugger..." )
c . Ui . Output ( "" )
2021-11-05 23:43:10 +00:00
c . Ui . Output ( fmt . Sprintf ( "Nomad CLI Version: %s" , version . GetVersion ( ) . FullVersionNumber ( true ) ) )
2021-10-12 20:58:41 +00:00
c . Ui . Output ( fmt . Sprintf ( " Region: %s" , c . region ) )
c . Ui . Output ( fmt . Sprintf ( " Namespace: %s" , c . namespace ) )
2020-11-12 16:25:28 +00:00
c . Ui . Output ( fmt . Sprintf ( " Servers: (%d/%d) %v" , serverCaptureCount , serversFound , c . serverIDs ) )
c . Ui . Output ( fmt . Sprintf ( " Clients: (%d/%d) %v" , nodeCaptureCount , nodesFound , c . nodeIDs ) )
if nodeCaptureCount > 0 && nodeCaptureCount == c . maxNodes {
c . Ui . Output ( fmt . Sprintf ( " Max node count reached (%d)" , c . maxNodes ) )
2020-06-25 16:51:23 +00:00
}
2020-11-12 16:25:28 +00:00
if nodeLookupFailCount > 0 {
c . Ui . Output ( fmt . Sprintf ( "Client fail count: %v" , nodeLookupFailCount ) )
}
if c . nodeClass != "" {
c . Ui . Output ( fmt . Sprintf ( " Node Class: %s" , c . nodeClass ) )
}
c . Ui . Output ( fmt . Sprintf ( " Interval: %s" , interval ) )
c . Ui . Output ( fmt . Sprintf ( " Duration: %s" , duration ) )
Fix flaky `operator debug` test (#12501)
We introduced a `pprof-interval` argument to `operator debug` in #11938, and unfortunately this has resulted in a lot of test flakes. The actual command in use is mostly fine (although I've fixed some quirks here), so what's really happened is that the change has revealed some existing issues in the tests. Summary of changes:
* Make first pprof collection synchronous to preserve the existing
behavior for the common case where the pprof interval matches the
duration.
* Clamp `operator debug` pprof timing to that of the command. The
`pprof-duration` should be no more than `duration` and the
`pprof-interval` should be no more than `pprof-duration`. Clamp the
values rather than throwing errors, which could change the commands
that existing users might already have in debugging scripts
* Testing: remove test parallelism
The `operator debug` tests that stand up servers can't be run in
parallel, because we don't have a way of canceling the API calls for
pprof. The agent will still be running the last pprof when we exit,
and that breaks the next test that talks to that same agent.
(Because you can only run one pprof at a time on any process!)
We could split off each subtest into its own server, but this test
suite is already very slow. In future work we should fix this "for
real" by making the API call cancelable.
* Testing: assert against unexpected errors in `operator debug` tests.
If we assert there are no unexpected error outputs, it's easier for
the developer to debug when something is going wrong with the tests
because the error output will be presented as a failing test, rather
than just a failing exit code check. Or worse, no failing exit code
check!
This also forces us to be explicit about which tests will return 0
exit codes but still emit (presumably ignorable) error outputs.
Additional minor bug fixes (mostly in tests) and test refactorings:
* Fix text alignment on pprof Duration in `operator debug` output
* Remove "done" channel from `operator debug` event stream test. The
goroutine we're blocking for here already tells us it's done by
sending a value, so block on that instead of an extraneous channel
* Event stream test timer should start at current time, not zero
* Remove noise from `operator debug` test log output. The `t.Logf`
calls already are picked out from the rest of the test output by
being prefixed with the filename.
* Remove explicit pprof args so we use the defaults clamped from
duration/interval
2022-04-07 19:00:07 +00:00
c . Ui . Output ( fmt . Sprintf ( " pprof Interval: %s" , pprofInterval ) )
2020-12-01 17:36:05 +00:00
if c . pprofDuration . Seconds ( ) != 1 {
c . Ui . Output ( fmt . Sprintf ( " pprof Duration: %s" , c . pprofDuration ) )
}
2022-01-18 02:35:51 +00:00
if c . topics != nil {
c . Ui . Output ( fmt . Sprintf ( " Event topics: %+v" , c . topics ) )
}
2020-11-12 16:25:28 +00:00
c . Ui . Output ( "" )
c . Ui . Output ( "Capturing cluster data..." )
2020-06-25 16:51:23 +00:00
2020-11-12 16:25:28 +00:00
// Start collecting data
2020-06-25 16:51:23 +00:00
err = c . collect ( client )
if err != nil {
c . Ui . Error ( fmt . Sprintf ( "Error collecting data: %s" , err . Error ( ) ) )
return 2
}
2020-11-12 16:25:28 +00:00
// Write index json/html manifest files
2020-06-25 16:51:23 +00:00
c . writeManifest ( )
2020-11-12 16:25:28 +00:00
// Exit before archive if output directory was specified
2020-06-25 16:51:23 +00:00
if output != "" {
c . Ui . Output ( fmt . Sprintf ( "Created debug directory: %s" , c . collectDir ) )
return 0
}
2020-11-12 16:25:28 +00:00
// Create archive tarball
2020-06-25 16:51:23 +00:00
archiveFile := stamped + ".tar.gz"
2020-08-11 17:14:28 +00:00
err = TarCZF ( archiveFile , tmp , stamped )
2020-06-25 16:51:23 +00:00
if err != nil {
c . Ui . Error ( fmt . Sprintf ( "Error creating archive: %s" , err . Error ( ) ) )
return 2
}
2020-11-12 16:25:28 +00:00
// Final output with name of tarball
2020-06-25 16:51:23 +00:00
c . Ui . Output ( fmt . Sprintf ( "Created debug archive: %s" , archiveFile ) )
return 0
}
// collect collects data from our endpoints and writes the archive bundle
2020-08-11 19:39:44 +00:00
func ( c * OperatorDebugCommand ) collect ( client * api . Client ) error {
2022-01-18 02:35:51 +00:00
// Start background captures
c . startMonitors ( client )
c . startEventStream ( client )
2020-06-25 16:51:23 +00:00
2022-01-18 02:35:51 +00:00
// Collect cluster data
2020-06-25 16:51:23 +00:00
self , err := client . Agent ( ) . Self ( )
2022-09-22 14:02:00 +00:00
c . reportErr ( writeResponseOrErrorToFile (
self , err , c . newFile ( clusterDir , "agent-self.json" ) ) )
2020-06-25 16:51:23 +00:00
2021-12-15 15:44:03 +00:00
namespaces , _ , err := client . Namespaces ( ) . List ( c . queryOpts ( ) )
2022-09-22 14:02:00 +00:00
c . reportErr ( writeResponseOrErrorToFile (
namespaces , err , c . newFile ( clusterDir , "namespaces.json" ) ) )
2021-10-12 20:58:41 +00:00
regions , err := client . Regions ( ) . List ( )
2022-09-22 14:02:00 +00:00
c . reportErr ( writeResponseOrErrorToFile (
regions , err , c . newFile ( clusterDir , "regions.json" ) ) )
2021-10-12 20:58:41 +00:00
2021-11-05 23:43:10 +00:00
// Collect data from Consul
if c . consul . addrVal == "" {
c . getConsulAddrFromSelf ( self )
}
c . collectConsul ( clusterDir )
2020-06-25 16:51:23 +00:00
2021-11-05 23:43:10 +00:00
// Collect data from Vault
vaultAddr := c . vault . addrVal
if vaultAddr == "" {
vaultAddr = c . getVaultAddrFromSelf ( self )
2020-06-25 16:51:23 +00:00
}
2021-11-05 23:43:10 +00:00
c . collectVault ( clusterDir , vaultAddr )
2020-06-25 16:51:23 +00:00
2020-08-11 17:14:28 +00:00
c . collectAgentHosts ( client )
Fix flaky `operator debug` test (#12501)
We introduced a `pprof-interval` argument to `operator debug` in #11938, and unfortunately this has resulted in a lot of test flakes. The actual command in use is mostly fine (although I've fixed some quirks here), so what's really happened is that the change has revealed some existing issues in the tests. Summary of changes:
* Make first pprof collection synchronous to preserve the existing
behavior for the common case where the pprof interval matches the
duration.
* Clamp `operator debug` pprof timing to that of the command. The
`pprof-duration` should be no more than `duration` and the
`pprof-interval` should be no more than `pprof-duration`. Clamp the
values rather than throwing errors, which could change the commands
that existing users might already have in debugging scripts
* Testing: remove test parallelism
The `operator debug` tests that stand up servers can't be run in
parallel, because we don't have a way of canceling the API calls for
pprof. The agent will still be running the last pprof when we exit,
and that breaks the next test that talks to that same agent.
(Because you can only run one pprof at a time on any process!)
We could split off each subtest into its own server, but this test
suite is already very slow. In future work we should fix this "for
real" by making the API call cancelable.
* Testing: assert against unexpected errors in `operator debug` tests.
If we assert there are no unexpected error outputs, it's easier for
the developer to debug when something is going wrong with the tests
because the error output will be presented as a failing test, rather
than just a failing exit code check. Or worse, no failing exit code
check!
This also forces us to be explicit about which tests will return 0
exit codes but still emit (presumably ignorable) error outputs.
Additional minor bug fixes (mostly in tests) and test refactorings:
* Fix text alignment on pprof Duration in `operator debug` output
* Remove "done" channel from `operator debug` event stream test. The
goroutine we're blocking for here already tells us it's done by
sending a value, so block on that instead of an extraneous channel
* Event stream test timer should start at current time, not zero
* Remove noise from `operator debug` test log output. The `t.Logf`
calls already are picked out from the rest of the test output by
being prefixed with the filename.
* Remove explicit pprof args so we use the defaults clamped from
duration/interval
2022-04-07 19:00:07 +00:00
c . collectPeriodicPprofs ( client )
2020-06-25 16:51:23 +00:00
c . collectPeriodic ( client )
return nil
}
// path returns platform specific paths in the tmp root directory
2020-08-11 19:39:44 +00:00
func ( c * OperatorDebugCommand ) path ( paths ... string ) string {
2020-06-25 16:51:23 +00:00
ps := [ ] string { c . collectDir }
ps = append ( ps , paths ... )
return filepath . Join ( ps ... )
}
// mkdir creates directories in the tmp root directory
2020-08-11 19:39:44 +00:00
func ( c * OperatorDebugCommand ) mkdir ( paths ... string ) error {
2020-12-01 17:36:05 +00:00
joinedPath := c . path ( paths ... )
// Ensure path doesn't escape the sandbox of the capture directory
2022-08-24 14:56:42 +00:00
escapes := escapingfs . PathEscapesSandbox ( c . collectDir , joinedPath )
2020-12-01 17:36:05 +00:00
if escapes {
return fmt . Errorf ( "file path escapes capture directory" )
}
2022-09-22 14:02:00 +00:00
return escapingfs . EnsurePath ( joinedPath , true )
2020-06-25 16:51:23 +00:00
}
// startMonitors starts go routines for each node and client
2020-08-11 19:39:44 +00:00
func ( c * OperatorDebugCommand ) startMonitors ( client * api . Client ) {
2020-06-25 16:51:23 +00:00
for _ , id := range c . nodeIDs {
2021-10-13 22:00:55 +00:00
go c . startMonitor ( clientDir , "node_id" , id , client )
2020-06-25 16:51:23 +00:00
}
for _ , id := range c . serverIDs {
2021-10-13 22:00:55 +00:00
go c . startMonitor ( serverDir , "server_id" , id , client )
2020-06-25 16:51:23 +00:00
}
}
// startMonitor starts one monitor api request, writing to a file. It blocks and should be
// called in a go routine. Errors are ignored, we want to build the archive even if a node
// is unavailable
2020-08-11 19:39:44 +00:00
func ( c * OperatorDebugCommand ) startMonitor ( path , idKey , nodeID string , client * api . Client ) {
2020-06-25 16:51:23 +00:00
c . mkdir ( path , nodeID )
fh , err := os . Create ( c . path ( path , nodeID , "monitor.log" ) )
if err != nil {
return
}
defer fh . Close ( )
qo := api . QueryOptions {
Params : map [ string ] string {
idKey : nodeID ,
"log_level" : c . logLevel ,
} ,
2021-12-15 15:44:03 +00:00
AllowStale : c . queryOpts ( ) . AllowStale ,
2020-06-25 16:51:23 +00:00
}
outCh , errCh := client . Agent ( ) . Monitor ( c . ctx . Done ( ) , & qo )
for {
select {
case out := <- outCh :
if out == nil {
continue
}
fh . Write ( out . Data )
case err := <- errCh :
fh . WriteString ( fmt . Sprintf ( "monitor: %s\n" , err . Error ( ) ) )
return
case <- c . ctx . Done ( ) :
return
}
}
}
2022-01-18 02:35:51 +00:00
// startEventStream runs captureEventStream in a background goroutine and
// reports any error it returns through c.Ui.Error.
func (c *OperatorDebugCommand) startEventStream(client *api.Client) {
	c.verboseOut("Launching eventstream goroutine...")

	go func() {
		err := c.captureEventStream(client)
		if err == nil {
			return
		}

		// A multierror is rendered with its list formatter; any other error
		// uses its plain text.
		var msg string
		switch e := err.(type) {
		case *multierror.Error:
			msg = multierror.ListFormatFunc(e.Errors)
		default:
			msg = err.Error()
		}

		c.Ui.Error(fmt.Sprintf("Error capturing event stream: %s", msg))
	}()
}
// captureEventStream subscribes to the event stream and appends every
// received event to eventstream.json in the cluster directory, one JSON
// document per line. It blocks until c.ctx is done or the stream channel is
// closed.
//
// It returns an error immediately if the output directory or file cannot be
// created, or if the stream cannot be started (a canceled context at startup
// is not treated as an error). Errors seen while streaming, marshaling or
// writing are accumulated and returned together once capture stops; nil is
// returned when there were none.
func (c *OperatorDebugCommand) captureEventStream(client *api.Client) error {
	// Ensure output directory is present. mkdir already resolves its
	// arguments beneath the capture directory, so it must be given the
	// relative path; passing c.path(path) would root it twice and create a
	// stray nested directory.
	path := clusterDir
	if err := c.mkdir(path); err != nil {
		return err
	}

	// Create the output file
	fh, err := os.Create(c.path(path, "eventstream.json"))
	if err != nil {
		return err
	}
	defer fh.Close()

	// Get handle to events endpoint
	events := client.EventStream()

	// Start streaming events
	eventCh, err := events.Stream(c.ctx, c.topics, c.index, c.queryOpts())
	if err != nil {
		if errors.Is(err, context.Canceled) {
			c.verboseOut("Event stream canceled: No events captured")
			return nil
		}
		return fmt.Errorf("failed to stream events: %w", err)
	}

	eventCount := 0
	errCount := 0
	heartbeatCount := 0
	channelEventCount := 0

	var mErrs *multierror.Error

	// finish logs the capture counters and returns the accumulated errors.
	finish := func() error {
		c.verboseOutf("Event stream captured %d events, %d frames, %d heartbeats, %d errors", eventCount, channelEventCount, heartbeatCount, errCount)
		return mErrs.ErrorOrNil()
	}

	for {
		select {
		case event, ok := <-eventCh:
			if !ok {
				// The stream was closed by the sender. Nothing more will
				// arrive, and receiving again would only yield zero values.
				return finish()
			}

			channelEventCount++
			if event.Err != nil {
				errCount++
				c.verboseOutf("error from event stream: index; %d err: %v", event.Index, event.Err)
				mErrs = multierror.Append(mErrs, fmt.Errorf("error at index: %d, Err: %w", event.Index, event.Err))
				// Record the error and keep listening
				continue
			}

			if event.IsHeartbeat() {
				heartbeatCount++
				continue
			}

			for _, e := range event.Events {
				eventCount++
				c.verboseOutf("Event: %4d, Index: %d, Topic: %-10s, Type: %s, FilterKeys: %s", eventCount, e.Index, e.Topic, e.Type, e.FilterKeys)

				data, err := json.Marshal(e)
				if err != nil {
					errCount++
					mErrs = multierror.Append(mErrs, fmt.Errorf("failed to marshal json from Topic: %s, Type: %s, Err: %w", e.Topic, e.Type, err))
					// Nothing to write for this event; avoid emitting an
					// empty line into the output file.
					continue
				}

				n, err := fh.Write(data)
				if err != nil {
					errCount++
					mErrs = multierror.Append(mErrs, fmt.Errorf("failed to write bytes to eventstream.json; bytes written: %d, Err: %w", n, err))
					// Skip the remaining events in this frame
					break
				}

				n, err = fh.WriteString("\n")
				if err != nil {
					errCount++
					mErrs = multierror.Append(mErrs, fmt.Errorf("failed to write string to eventstream.json; chars written: %d, Err: %w", n, err))
				}
			}
		case <-c.ctx.Done():
			return finish()
		}
	}
}
2020-07-02 13:51:25 +00:00
// collectAgentHosts calls collectAgentHost for each selected node
2020-08-11 19:39:44 +00:00
func ( c * OperatorDebugCommand ) collectAgentHosts ( client * api . Client ) {
2020-07-02 13:51:25 +00:00
for _ , n := range c . nodeIDs {
2021-10-13 22:00:55 +00:00
c . collectAgentHost ( clientDir , n , client )
2020-07-02 13:51:25 +00:00
}
for _ , n := range c . serverIDs {
2021-10-13 22:00:55 +00:00
c . collectAgentHost ( serverDir , n , client )
2020-07-02 13:51:25 +00:00
}
}
// collectAgentHost gets the agent host data
2020-08-11 19:39:44 +00:00
func ( c * OperatorDebugCommand ) collectAgentHost ( path , id string , client * api . Client ) {
2020-07-02 13:51:25 +00:00
var host * api . HostDataResponse
var err error
2021-10-13 22:00:55 +00:00
if path == serverDir {
2021-12-15 15:44:03 +00:00
host , err = client . Agent ( ) . Host ( id , "" , c . queryOpts ( ) )
2020-07-02 13:51:25 +00:00
} else {
2021-12-15 15:44:03 +00:00
host , err = client . Agent ( ) . Host ( "" , id , c . queryOpts ( ) )
2020-07-02 13:51:25 +00:00
}
2022-01-17 16:15:17 +00:00
if isRedirectError ( err ) {
c . Ui . Warn ( fmt . Sprintf ( "%s/%s: /v1/agent/host unavailable on this agent" , path , id ) )
return
}
2020-12-01 17:36:05 +00:00
if err != nil {
c . Ui . Error ( fmt . Sprintf ( "%s/%s: Failed to retrieve agent host data, err: %v" , path , id , err ) )
2020-07-02 13:51:25 +00:00
2022-08-02 14:33:08 +00:00
if strings . Contains ( err . Error ( ) , api . PermissionDeniedErrorContent ) {
2020-12-01 17:36:05 +00:00
// Drop a hint to help the operator resolve the error
2020-12-09 19:05:18 +00:00
c . Ui . Warn ( "Agent host retrieval requires agent:read ACL or enable_debug=true. See https://www.nomadproject.io/api-docs/agent#host for more information." )
2020-12-01 17:36:05 +00:00
}
return // exit on any error
}
path = filepath . Join ( path , id )
2022-09-22 14:02:00 +00:00
c . reportErr ( writeResponseToFile ( host , c . newFile ( path , "agent-host.json" ) ) )
2020-07-02 13:51:25 +00:00
}
2022-04-04 19:24:12 +00:00
func ( c * OperatorDebugCommand ) collectPeriodicPprofs ( client * api . Client ) {
2022-04-28 17:18:55 +00:00
pprofNodeIDs := [ ] string { }
pprofServerIDs := [ ] string { }
// threadcreate pprof causes a panic on Nomad 0.11.0 to 0.11.2 -- skip those versions
for _ , serverID := range c . serverIDs {
version := c . getNomadVersion ( serverID , "" )
err := checkVersion ( version , minimumVersionPprofConstraint )
if err != nil {
c . Ui . Warn ( fmt . Sprintf ( "Skipping pprof: %v" , err ) )
}
pprofServerIDs = append ( pprofServerIDs , serverID )
}
for _ , nodeID := range c . nodeIDs {
version := c . getNomadVersion ( "" , nodeID )
err := checkVersion ( version , minimumVersionPprofConstraint )
if err != nil {
c . Ui . Warn ( fmt . Sprintf ( "Skipping pprof: %v" , err ) )
}
pprofNodeIDs = append ( pprofNodeIDs , nodeID )
}
Fix flaky `operator debug` test (#12501)
We introduced a `pprof-interval` argument to `operator debug` in #11938, and unfortunately this has resulted in a lot of test flakes. The actual command in use is mostly fine (although I've fixed some quirks here), so what's really happened is that the change has revealed some existing issues in the tests. Summary of changes:
* Make first pprof collection synchronous to preserve the existing
behavior for the common case where the pprof interval matches the
duration.
* Clamp `operator debug` pprof timing to that of the command. The
`pprof-duration` should be no more than `duration` and the
`pprof-interval` should be no more than `pprof-duration`. Clamp the
values rather than throwing errors, which could change the commands
that existing users might already have in debugging scripts
* Testing: remove test parallelism
The `operator debug` tests that stand up servers can't be run in
parallel, because we don't have a way of canceling the API calls for
pprof. The agent will still be running the last pprof when we exit,
and that breaks the next test that talks to that same agent.
(Because you can only run one pprof at a time on any process!)
We could split off each subtest into its own server, but this test
suite is already very slow. In future work we should fix this "for
real" by making the API call cancelable.
* Testing: assert against unexpected errors in `operator debug` tests.
If we assert there are no unexpected error outputs, it's easier for
the developer to debug when something is going wrong with the tests
because the error output will be presented as a failing test, rather
than just a failing exit code check. Or worse, no failing exit code
check!
This also forces us to be explicit about which tests will return 0
exit codes but still emit (presumably ignorable) error outputs.
Additional minor bug fixes (mostly in tests) and test refactorings:
* Fix text alignment on pprof Duration in `operator debug` output
* Remove "done" channel from `operator debug` event stream test. The
goroutine we're blocking for here already tells us it's done by
sending a value, so block on that instead of an extraneous channel
* Event stream test timer should start at current time, not zero
* Remove noise from `operator debug` test log output. The `t.Logf`
calls already are picked out from the rest of the test output by
being prefixed with the filename.
* Remove explicit pprof args so we use the defaults clamped from
duration/interval
2022-04-07 19:00:07 +00:00
// Take the first set of pprofs synchronously...
c . Ui . Output ( " Capture pprofInterval 0000" )
2022-04-28 17:18:55 +00:00
c . collectPprofs ( client , pprofServerIDs , pprofNodeIDs , 0 )
Fix flaky `operator debug` test (#12501)
We introduced a `pprof-interval` argument to `operator debug` in #11938, and unfortunately this has resulted in a lot of test flakes. The actual command in use is mostly fine (although I've fixed some quirks here), so what's really happened is that the change has revealed some existing issues in the tests. Summary of changes:
* Make first pprof collection synchronous to preserve the existing
behavior for the common case where the pprof interval matches the
duration.
* Clamp `operator debug` pprof timing to that of the command. The
`pprof-duration` should be no more than `duration` and the
`pprof-interval` should be no more than `pprof-duration`. Clamp the
values rather than throwing errors, which could change the commands
that existing users might already have in debugging scripts
* Testing: remove test parallelism
The `operator debug` tests that stand up servers can't be run in
parallel, because we don't have a way of canceling the API calls for
pprof. The agent will still be running the last pprof when we exit,
and that breaks the next test that talks to that same agent.
(Because you can only run one pprof at a time on any process!)
We could split off each subtest into its own server, but this test
suite is already very slow. In future work we should fix this "for
real" by making the API call cancelable.
* Testing: assert against unexpected errors in `operator debug` tests.
If we assert there are no unexpected error outputs, it's easier for
the developer to debug when something is going wrong with the tests
because the error output will be presented as a failing test, rather
than just a failing exit code check. Or worse, no failing exit code
check!
This also forces us to be explicit about which tests will return 0
exit codes but still emit (presumably ignorable) error outputs.
Additional minor bug fixes (mostly in tests) and test refactorings:
* Fix text alignment on pprof Duration in `operator debug` output
* Remove "done" channel from `operator debug` event stream test. The
goroutine we're blocking for here already tells us it's done by
sending a value, so block on that instead of an extraneous channel
* Event stream test timer should start at current time, not zero
* Remove noise from `operator debug` test log output. The `t.Logf`
calls already are picked out from the rest of the test output by
being prefixed with the filename.
* Remove explicit pprof args so we use the defaults clamped from
duration/interval
2022-04-07 19:00:07 +00:00
if c . pprofInterval == c . pprofDuration {
return
}
2022-04-04 19:24:12 +00:00
Fix flaky `operator debug` test (#12501)
We introduced a `pprof-interval` argument to `operator debug` in #11938, and unfortunately this has resulted in a lot of test flakes. The actual command in use is mostly fine (although I've fixed some quirks here), so what's really happened is that the change has revealed some existing issues in the tests. Summary of changes:
* Make first pprof collection synchronous to preserve the existing
behavior for the common case where the pprof interval matches the
duration.
* Clamp `operator debug` pprof timing to that of the command. The
`pprof-duration` should be no more than `duration` and the
`pprof-interval` should be no more than `pprof-duration`. Clamp the
values rather than throwing errors, which could change the commands
that existing users might already have in debugging scripts
* Testing: remove test parallelism
The `operator debug` tests that stand up servers can't be run in
parallel, because we don't have a way of canceling the API calls for
pprof. The agent will still be running the last pprof when we exit,
and that breaks the next test that talks to that same agent.
(Because you can only run one pprof at a time on any process!)
We could split off each subtest into its own server, but this test
suite is already very slow. In future work we should fix this "for
real" by making the API call cancelable.
* Testing: assert against unexpected errors in `operator debug` tests.
If we assert there are no unexpected error outputs, it's easier for
the developer to debug when something is going wrong with the tests
because the error output will be presented as a failing test, rather
than just a failing exit code check. Or worse, no failing exit code
check!
This also forces us to be explicit about which tests will return 0
exit codes but still emit (presumably ignorable) error outputs.
Additional minor bug fixes (mostly in tests) and test refactorings:
* Fix text alignment on pprof Duration in `operator debug` output
* Remove "done" channel from `operator debug` event stream test. The
goroutine we're blocking for here already tells us it's done by
sending a value, so block on that instead of an extraneous channel
* Event stream test timer should start at current time, not zero
* Remove noise from `operator debug` test log output. The `t.Logf`
calls already are picked out from the rest of the test output by
being prefixed with the filename.
* Remove explicit pprof args so we use the defaults clamped from
duration/interval
2022-04-07 19:00:07 +00:00
// ... and then move the rest off into a goroutine
go func ( ) {
ctx , cancel := context . WithTimeout ( c . ctx , c . duration )
defer cancel ( )
timer , stop := helper . NewSafeTimer ( c . pprofInterval )
defer stop ( )
pprofIntervalCount := 1
for {
select {
case <- ctx . Done ( ) :
return
case <- timer . C :
c . Ui . Output ( fmt . Sprintf ( " Capture pprofInterval %04d" , pprofIntervalCount ) )
2022-04-28 17:18:55 +00:00
c . collectPprofs ( client , pprofServerIDs , pprofNodeIDs , pprofIntervalCount )
Fix flaky `operator debug` test (#12501)
We introduced a `pprof-interval` argument to `operator debug` in #11938, and unfortunately this has resulted in a lot of test flakes. The actual command in use is mostly fine (although I've fixed some quirks here), so what's really happened is that the change has revealed some existing issues in the tests. Summary of changes:
* Make first pprof collection synchronous to preserve the existing
behavior for the common case where the pprof interval matches the
duration.
* Clamp `operator debug` pprof timing to that of the command. The
`pprof-duration` should be no more than `duration` and the
`pprof-interval` should be no more than `pprof-duration`. Clamp the
values rather than throwing errors, which could change the commands
that existing users might already have in debugging scripts
* Testing: remove test parallelism
The `operator debug` tests that stand up servers can't be run in
parallel, because we don't have a way of canceling the API calls for
pprof. The agent will still be running the last pprof when we exit,
and that breaks the next test that talks to that same agent.
(Because you can only run one pprof at a time on any process!)
We could split off each subtest into its own server, but this test
suite is already very slow. In future work we should fix this "for
real" by making the API call cancelable.
* Testing: assert against unexpected errors in `operator debug` tests.
If we assert there are no unexpected error outputs, it's easier for
the developer to debug when something is going wrong with the tests
because the error output will be presented as a failing test, rather
than just a failing exit code check. Or worse, no failing exit code
check!
This also forces us to be explicit about which tests will return 0
exit codes but still emit (presumably ignorable) error outputs.
Additional minor bug fixes (mostly in tests) and test refactorings:
* Fix text alignment on pprof Duration in `operator debug` output
* Remove "done" channel from `operator debug` event stream test. The
goroutine we're blocking for here already tells us it's done by
sending a value, so block on that instead of an extraneous channel
* Event stream test timer should start at current time, not zero
* Remove noise from `operator debug` test log output. The `t.Logf`
calls already are picked out from the rest of the test output by
being prefixed with the filename.
* Remove explicit pprof args so we use the defaults clamped from
duration/interval
2022-04-07 19:00:07 +00:00
timer . Reset ( c . pprofInterval )
pprofIntervalCount ++
}
2022-04-04 19:24:12 +00:00
}
Fix flaky `operator debug` test (#12501)
We introduced a `pprof-interval` argument to `operator debug` in #11938, and unfortunately this has resulted in a lot of test flakes. The actual command in use is mostly fine (although I've fixed some quirks here), so what's really happened is that the change has revealed some existing issues in the tests. Summary of changes:
* Make first pprof collection synchronous to preserve the existing
behavior for the common case where the pprof interval matches the
duration.
* Clamp `operator debug` pprof timing to that of the command. The
`pprof-duration` should be no more than `duration` and the
`pprof-interval` should be no more than `pprof-duration`. Clamp the
values rather than throwing errors, which could change the commands
that existing users might already have in debugging scripts
* Testing: remove test parallelism
The `operator debug` tests that stand up servers can't be run in
parallel, because we don't have a way of canceling the API calls for
pprof. The agent will still be running the last pprof when we exit,
and that breaks the next test that talks to that same agent.
(Because you can only run one pprof at a time on any process!)
We could split off each subtest into its own server, but this test
suite is already very slow. In future work we should fix this "for
real" by making the API call cancelable.
* Testing: assert against unexpected errors in `operator debug` tests.
If we assert there are no unexpected error outputs, it's easier for
the developer to debug when something is going wrong with the tests
because the error output will be presented as a failing test, rather
than just a failing exit code check. Or worse, no failing exit code
check!
This also forces us to be explicit about which tests will return 0
exit codes but still emit (presumably ignorable) error outputs.
Additional minor bug fixes (mostly in tests) and test refactorings:
* Fix text alignment on pprof Duration in `operator debug` output
* Remove "done" channel from `operator debug` event stream test. The
goroutine we're blocking for here already tells us it's done by
sending a value, so block on that instead of an extraneous channel
* Event stream test timer should start at current time, not zero
* Remove noise from `operator debug` test log output. The `t.Logf`
calls already are picked out from the rest of the test output by
being prefixed with the filename.
* Remove explicit pprof args so we use the defaults clamped from
duration/interval
2022-04-07 19:00:07 +00:00
} ( )
2022-04-04 19:24:12 +00:00
}
2020-06-25 16:51:23 +00:00
// collectPprofs captures the /agent/pprof for each listed node
2022-04-28 17:18:55 +00:00
func ( c * OperatorDebugCommand ) collectPprofs ( client * api . Client , serverIDs , nodeIDs [ ] string , interval int ) {
for _ , n := range nodeIDs {
2022-04-04 19:24:12 +00:00
c . collectPprof ( clientDir , n , client , interval )
2020-06-25 16:51:23 +00:00
}
2022-04-28 17:18:55 +00:00
for _ , n := range serverIDs {
2022-04-04 19:24:12 +00:00
c . collectPprof ( serverDir , n , client , interval )
2020-06-25 16:51:23 +00:00
}
}
// collectPprof captures pprof data for the node
2022-04-04 19:24:12 +00:00
func ( c * OperatorDebugCommand ) collectPprof ( path , id string , client * api . Client , interval int ) {
2020-12-01 17:36:05 +00:00
pprofDurationSeconds := int ( c . pprofDuration . Seconds ( ) )
opts := api . PprofOptions { Seconds : pprofDurationSeconds }
2021-10-13 22:00:55 +00:00
if path == serverDir {
2020-06-25 16:51:23 +00:00
opts . ServerID = id
} else {
opts . NodeID = id
}
path = filepath . Join ( path , id )
2022-04-04 19:24:12 +00:00
filename := fmt . Sprintf ( "profile_%04d.prof" , interval )
2020-06-25 16:51:23 +00:00
2021-12-15 15:44:03 +00:00
bs , err := client . Agent ( ) . CPUProfile ( opts , c . queryOpts ( ) )
2020-12-01 17:36:05 +00:00
if err != nil {
2022-04-04 19:24:12 +00:00
c . Ui . Error ( fmt . Sprintf ( "%s: Failed to retrieve pprof %s, err: %v" , filename , path , err ) )
2022-08-02 14:33:08 +00:00
if strings . Contains ( err . Error ( ) , api . PermissionDeniedErrorContent ) {
2020-12-01 17:36:05 +00:00
// All Profiles require the same permissions, so we only need to see
// one permission failure before we bail.
// But lets first drop a hint to help the operator resolve the error
2020-12-09 19:05:18 +00:00
c . Ui . Warn ( "Pprof retrieval requires agent:write ACL or enable_debug=true. See https://www.nomadproject.io/api-docs/agent#agent-runtime-profiles for more information." )
2020-12-01 17:36:05 +00:00
return // only exit on 403
}
} else {
2022-04-04 19:24:12 +00:00
err := c . writeBytes ( path , filename , bs )
2020-12-08 20:47:04 +00:00
if err != nil {
2020-12-01 17:36:05 +00:00
c . Ui . Error ( err . Error ( ) )
}
2020-06-25 16:51:23 +00:00
}
2021-06-21 18:22:49 +00:00
// goroutine debug type 1 = legacy text format for human readable output
opts . Debug = 1
c . savePprofProfile ( path , "goroutine" , opts , client )
// goroutine debug type 2 = goroutine stacks in panic format
opts . Debug = 2
c . savePprofProfile ( path , "goroutine" , opts , client )
// Reset to pprof binary format
opts . Debug = 0
c . savePprofProfile ( path , "goroutine" , opts , client ) // Stack traces of all current goroutines
c . savePprofProfile ( path , "trace" , opts , client ) // A trace of execution of the current program
c . savePprofProfile ( path , "heap" , opts , client ) // A sampling of memory allocations of live objects. You can specify the gc GET parameter to run GC before taking the heap sample.
c . savePprofProfile ( path , "allocs" , opts , client ) // A sampling of all past memory allocations
c . savePprofProfile ( path , "threadcreate" , opts , client ) // Stack traces that led to the creation of new OS threads
}
// savePprofProfile retrieves a pprof profile and writes to disk
func ( c * OperatorDebugCommand ) savePprofProfile ( path string , profile string , opts api . PprofOptions , client * api . Client ) {
fileName := fmt . Sprintf ( "%s.prof" , profile )
if opts . Debug > 0 {
fileName = fmt . Sprintf ( "%s-debug%d.txt" , profile , opts . Debug )
2020-06-25 16:51:23 +00:00
}
2021-12-15 15:44:03 +00:00
bs , err := retrievePprofProfile ( profile , opts , client , c . queryOpts ( ) )
2020-12-01 17:36:05 +00:00
if err != nil {
2021-06-21 18:22:49 +00:00
c . Ui . Error ( fmt . Sprintf ( "%s: Failed to retrieve pprof %s, err: %s" , path , fileName , err . Error ( ) ) )
2020-06-25 16:51:23 +00:00
}
2020-10-14 19:16:10 +00:00
2021-06-21 18:22:49 +00:00
err = c . writeBytes ( path , fileName , bs )
2020-12-01 17:36:05 +00:00
if err != nil {
2021-06-21 18:22:49 +00:00
c . Ui . Error ( fmt . Sprintf ( "%s: Failed to write file %s, err: %s" , path , fileName , err . Error ( ) ) )
2020-10-14 19:16:10 +00:00
}
2021-06-21 18:22:49 +00:00
}
2020-10-14 19:16:10 +00:00
2021-12-15 15:44:03 +00:00
// retrievePprofProfile gets a pprof profile from the node specified
// in opts using the API client
func retrievePprofProfile ( profile string , opts api . PprofOptions , client * api . Client , qopts * api . QueryOptions ) ( bs [ ] byte , err error ) {
2021-06-21 18:22:49 +00:00
switch profile {
case "cpuprofile" :
2021-12-15 15:44:03 +00:00
bs , err = client . Agent ( ) . CPUProfile ( opts , qopts )
2021-06-21 18:22:49 +00:00
case "trace" :
2021-12-15 15:44:03 +00:00
bs , err = client . Agent ( ) . Trace ( opts , qopts )
2021-06-21 18:22:49 +00:00
default :
2021-12-15 15:44:03 +00:00
bs , err = client . Agent ( ) . Lookup ( profile , opts , qopts )
2020-10-14 19:16:10 +00:00
}
2021-06-21 18:22:49 +00:00
return bs , err
2020-06-25 16:51:23 +00:00
}
2021-12-15 15:44:03 +00:00
// collectPeriodic runs for duration, capturing the cluster state
// every interval. It flushes and stops the monitor requests
2020-08-11 19:39:44 +00:00
func ( c * OperatorDebugCommand ) collectPeriodic ( client * api . Client ) {
2020-06-25 16:51:23 +00:00
duration := time . After ( c . duration )
// Set interval to 0 so that we immediately execute, wait the interval next time
interval := time . After ( 0 * time . Second )
var intervalCount int
2020-08-11 17:14:28 +00:00
var name , dir string
2020-06-25 16:51:23 +00:00
for {
select {
case <- duration :
c . cancel ( )
return
case <- interval :
2020-08-11 17:14:28 +00:00
name = fmt . Sprintf ( "%04d" , intervalCount )
2021-10-13 22:00:55 +00:00
dir = filepath . Join ( intervalDir , name )
2020-08-11 17:14:28 +00:00
c . Ui . Output ( fmt . Sprintf ( " Capture interval %s" , name ) )
2020-06-25 16:51:23 +00:00
c . collectNomad ( dir , client )
2020-08-11 17:14:28 +00:00
c . collectOperator ( dir , client )
2020-06-25 16:51:23 +00:00
interval = time . After ( c . interval )
2020-12-01 17:36:05 +00:00
intervalCount ++
2020-06-25 16:51:23 +00:00
case <- c . ctx . Done ( ) :
return
}
}
}
2020-08-11 17:14:28 +00:00
// collectOperator captures some cluster meta information
2020-08-11 19:39:44 +00:00
func ( c * OperatorDebugCommand ) collectOperator ( dir string , client * api . Client ) {
2021-12-15 15:44:03 +00:00
rc , err := client . Operator ( ) . RaftGetConfiguration ( c . queryOpts ( ) )
2022-09-22 14:02:00 +00:00
c . reportErr ( writeResponseOrErrorToFile ( rc , err , c . newFile ( dir , "operator-raft.json" ) ) )
2020-08-11 17:14:28 +00:00
2021-12-15 15:44:03 +00:00
sc , _ , err := client . Operator ( ) . SchedulerGetConfiguration ( c . queryOpts ( ) )
2022-09-22 14:02:00 +00:00
c . reportErr ( writeResponseOrErrorToFile ( sc , err , c . newFile ( dir , "operator-scheduler.json" ) ) )
2020-08-11 17:14:28 +00:00
2021-12-15 15:44:03 +00:00
ah , _ , err := client . Operator ( ) . AutopilotServerHealth ( c . queryOpts ( ) )
2022-09-22 14:02:00 +00:00
c . reportErr ( writeResponseOrErrorToFile (
ah , err , c . newFile ( dir , "operator-autopilot-health.json" ) ) )
2020-08-31 17:22:23 +00:00
2021-12-15 15:44:03 +00:00
lic , _ , err := client . Operator ( ) . LicenseGet ( c . queryOpts ( ) )
2022-09-22 14:02:00 +00:00
c . reportErr ( writeResponseOrErrorToFile ( lic , err , c . newFile ( dir , "license.json" ) ) )
2020-08-11 17:14:28 +00:00
}
2020-06-25 16:51:23 +00:00
// collectNomad captures the nomad cluster state
2020-08-11 19:39:44 +00:00
func ( c * OperatorDebugCommand ) collectNomad ( dir string , client * api . Client ) error {
2020-06-25 16:51:23 +00:00
2021-12-15 15:44:03 +00:00
js , _ , err := client . Jobs ( ) . List ( c . queryOpts ( ) )
2022-09-22 14:02:00 +00:00
c . reportErr ( writeResponseStreamOrErrorToFile ( js , err , c . newFile ( dir , "jobs.json" ) ) )
2020-06-25 16:51:23 +00:00
2021-12-15 15:44:03 +00:00
ds , _ , err := client . Deployments ( ) . List ( c . queryOpts ( ) )
2022-09-22 14:02:00 +00:00
c . reportErr ( writeResponseStreamOrErrorToFile ( ds , err , c . newFile ( dir , "deployments.json" ) ) )
2020-06-25 16:51:23 +00:00
2021-12-15 15:44:03 +00:00
es , _ , err := client . Evaluations ( ) . List ( c . queryOpts ( ) )
2022-09-22 14:02:00 +00:00
c . reportErr ( writeResponseStreamOrErrorToFile ( es , err , c . newFile ( dir , "evaluations.json" ) ) )
2020-06-25 16:51:23 +00:00
2021-12-15 15:44:03 +00:00
as , _ , err := client . Allocations ( ) . List ( c . queryOpts ( ) )
2022-09-22 14:02:00 +00:00
c . reportErr ( writeResponseStreamOrErrorToFile ( as , err , c . newFile ( dir , "allocations.json" ) ) )
2020-06-25 16:51:23 +00:00
2021-12-15 15:44:03 +00:00
ns , _ , err := client . Nodes ( ) . List ( c . queryOpts ( ) )
2022-09-22 14:02:00 +00:00
c . reportErr ( writeResponseStreamOrErrorToFile ( ns , err , c . newFile ( dir , "nodes.json" ) ) )
2020-06-25 16:51:23 +00:00
2020-12-01 17:36:05 +00:00
// CSI Plugins - /v1/plugins?type=csi
2021-12-15 15:44:03 +00:00
ps , _ , err := client . CSIPlugins ( ) . List ( c . queryOpts ( ) )
2022-09-22 14:02:00 +00:00
c . reportErr ( writeResponseStreamOrErrorToFile ( ps , err , c . newFile ( dir , "csi-plugins.json" ) ) )
2020-06-25 16:51:23 +00:00
2020-12-01 17:36:05 +00:00
// CSI Plugin details - /v1/plugin/csi/:plugin_id
for _ , p := range ps {
2021-12-15 15:44:03 +00:00
csiPlugin , _ , err := client . CSIPlugins ( ) . Info ( p . ID , c . queryOpts ( ) )
2020-12-01 17:36:05 +00:00
csiPluginFileName := fmt . Sprintf ( "csi-plugin-id-%s.json" , p . ID )
2022-09-22 14:02:00 +00:00
c . reportErr ( writeResponseOrErrorToFile ( csiPlugin , err , c . newFile ( dir , csiPluginFileName ) ) )
2020-12-01 17:36:05 +00:00
}
2020-06-25 16:51:23 +00:00
2020-12-01 17:36:05 +00:00
// CSI Volumes - /v1/volumes?type=csi
2021-12-15 15:44:03 +00:00
csiVolumes , _ , err := client . CSIVolumes ( ) . List ( c . queryOpts ( ) )
2022-09-22 14:02:00 +00:00
c . reportErr ( writeResponseStreamOrErrorToFile (
csiVolumes , err , c . newFile ( dir , "csi-volumes.json" ) ) )
2020-12-01 17:36:05 +00:00
// CSI Volume details - /v1/volumes/csi/:volume-id
for _ , v := range csiVolumes {
2021-12-15 15:44:03 +00:00
csiVolume , _ , err := client . CSIVolumes ( ) . Info ( v . ID , c . queryOpts ( ) )
2020-12-01 17:36:05 +00:00
csiFileName := fmt . Sprintf ( "csi-volume-id-%s.json" , v . ID )
2022-09-22 14:02:00 +00:00
c . reportErr ( writeResponseOrErrorToFile ( csiVolume , err , c . newFile ( dir , csiFileName ) ) )
2020-10-14 19:16:10 +00:00
}
2020-10-06 02:30:01 +00:00
2021-12-15 15:44:03 +00:00
metrics , _ , err := client . Operator ( ) . MetricsSummary ( c . queryOpts ( ) )
2022-09-22 14:02:00 +00:00
c . reportErr ( writeResponseOrErrorToFile ( metrics , err , c . newFile ( dir , "metrics.json" ) ) )
2020-12-01 17:36:05 +00:00
2020-06-25 16:51:23 +00:00
return nil
}
2021-11-05 23:43:10 +00:00
// collectConsul calls the Consul API to collect data
func ( c * OperatorDebugCommand ) collectConsul ( dir string ) {
if c . consul . addrVal == "" {
c . Ui . Output ( "Consul - Skipping, no API address found" )
return
2020-06-25 16:51:23 +00:00
}
2021-11-05 23:43:10 +00:00
c . Ui . Info ( fmt . Sprintf ( "Consul - Collecting Consul API data from: %s" , c . consul . addrVal ) )
client , err := c . consulAPIClient ( )
if err != nil {
c . Ui . Error ( fmt . Sprintf ( "failed to create Consul API client: %s" , err ) )
return
}
// Exit if we are unable to retrieve the leader
err = c . collectConsulAPIRequest ( client , "/v1/status/leader" , dir , "consul-leader.json" )
if err != nil {
c . Ui . Output ( fmt . Sprintf ( "Unable to contact Consul leader, skipping: %s" , err ) )
return
}
c . collectConsulAPI ( client , "/v1/agent/host" , dir , "consul-agent-host.json" )
c . collectConsulAPI ( client , "/v1/agent/members" , dir , "consul-agent-members.json" )
c . collectConsulAPI ( client , "/v1/agent/metrics" , dir , "consul-agent-metrics.json" )
c . collectConsulAPI ( client , "/v1/agent/self" , dir , "consul-agent-self.json" )
}
// consulAPIClient returns the default HTTP client configured with the Consul
// TLS settings held on the command, or an error if TLS configuration fails.
func (c *OperatorDebugCommand) consulAPIClient() (*http.Client, error) {
	client := defaultHttpClient()
	if err := api.ConfigureTLS(client, c.consul.tls); err != nil {
		return nil, fmt.Errorf("failed to configure TLS: %w", err)
	}
	return client, nil
}
// collectConsulAPI requests urlPath from the Consul API and stores the
// response in dir/file. A failed request is reported to the UI rather than
// returned.
func (c *OperatorDebugCommand) collectConsulAPI(client *http.Client, urlPath string, dir string, file string) {
	if err := c.collectConsulAPIRequest(client, urlPath, dir, file); err != nil {
		c.Ui.Error(fmt.Sprintf("Error collecting from Consul API: %s", err.Error()))
	}
}
func ( c * OperatorDebugCommand ) collectConsulAPIRequest ( client * http . Client , urlPath string , dir string , file string ) error {
url := c . consul . addrVal + urlPath
req , err := http . NewRequest ( "GET" , url , nil )
if err != nil {
return fmt . Errorf ( "failed to create HTTP request for Consul API URL=%q: %w" , url , err )
}
2020-06-25 16:51:23 +00:00
2020-08-11 17:14:28 +00:00
req . Header . Add ( "X-Consul-Token" , c . consul . token ( ) )
2020-06-25 16:51:23 +00:00
req . Header . Add ( "User-Agent" , userAgent )
2021-11-05 23:43:10 +00:00
2020-06-25 16:51:23 +00:00
resp , err := client . Do ( req )
2021-11-05 23:43:10 +00:00
if err != nil {
return err
}
2020-06-25 16:51:23 +00:00
2021-11-05 23:43:10 +00:00
c . writeBody ( dir , file , resp , err )
2020-06-25 16:51:23 +00:00
return nil
}
// collectVault calls the Vault API directly to collect data
2020-08-11 19:39:44 +00:00
func ( c * OperatorDebugCommand ) collectVault ( dir , vault string ) error {
2021-11-05 23:43:10 +00:00
vaultAddr := c . vault . addr ( vault )
if vaultAddr == "" {
2020-06-25 16:51:23 +00:00
return nil
}
2021-11-05 23:43:10 +00:00
c . Ui . Info ( fmt . Sprintf ( "Vault - Collecting Vault API data from: %s" , vaultAddr ) )
2020-08-11 17:14:28 +00:00
client := defaultHttpClient ( )
2021-11-05 23:43:10 +00:00
if c . vault . ssl {
err := api . ConfigureTLS ( client , c . vault . tls )
if err != nil {
return fmt . Errorf ( "failed to configure TLS: %w" , err )
}
}
req , err := http . NewRequest ( "GET" , vaultAddr + "/v1/sys/health" , nil )
if err != nil {
return fmt . Errorf ( "failed to create HTTP request for Vault API URL=%q: %w" , vaultAddr , err )
}
2020-06-25 16:51:23 +00:00
2020-08-11 17:14:28 +00:00
req . Header . Add ( "X-Vault-Token" , c . vault . token ( ) )
2020-06-25 16:51:23 +00:00
req . Header . Add ( "User-Agent" , userAgent )
resp , err := client . Do ( req )
c . writeBody ( dir , "vault-sys-health.json" , resp , err )
return nil
}
// writeBytes writes a file to the archive, recording it in the manifest
2020-08-11 19:39:44 +00:00
func ( c * OperatorDebugCommand ) writeBytes ( dir , file string , data [ ] byte ) error {
2020-12-01 17:36:05 +00:00
// Replace invalid characters in filename
filename := helper . CleanFilename ( file , "_" )
relativePath := filepath . Join ( dir , filename )
2020-10-14 19:16:10 +00:00
c . manifest = append ( c . manifest , relativePath )
dirPath := filepath . Join ( c . collectDir , dir )
2020-12-01 17:36:05 +00:00
filePath := filepath . Join ( dirPath , filename )
2020-10-14 19:16:10 +00:00
// Ensure parent directories exist
2022-09-22 14:02:00 +00:00
err := escapingfs . EnsurePath ( dirPath , true )
2020-10-14 19:16:10 +00:00
if err != nil {
2022-09-22 14:02:00 +00:00
return fmt . Errorf ( "failed to create parent directories of %q: %w" , dirPath , err )
2020-12-01 17:36:05 +00:00
}
// Ensure filename doesn't escape the sandbox of the capture directory
2022-08-24 14:56:42 +00:00
escapes := escapingfs . PathEscapesSandbox ( c . collectDir , filePath )
2020-12-01 17:36:05 +00:00
if escapes {
2022-09-22 14:02:00 +00:00
return fmt . Errorf ( "file path %q escapes capture directory %q" , filePath , c . collectDir )
2020-10-14 19:16:10 +00:00
}
2020-06-25 16:51:23 +00:00
2020-10-14 19:16:10 +00:00
// Create the file
fh , err := os . Create ( filePath )
2020-06-25 16:51:23 +00:00
if err != nil {
2022-09-22 14:02:00 +00:00
return fmt . Errorf ( "failed to create file %q, err: %w" , filePath , err )
2020-06-25 16:51:23 +00:00
}
defer fh . Close ( )
_ , err = fh . Write ( data )
2020-12-01 17:36:05 +00:00
if err != nil {
2022-09-22 14:02:00 +00:00
return fmt . Errorf ( "Failed to write data to file %q, err: %w" , filePath , err )
2020-12-01 17:36:05 +00:00
}
return nil
2020-06-25 16:51:23 +00:00
}
2022-09-22 14:02:00 +00:00
// newFilePath builds the on-disk path for file inside dir of the capture
// directory. The file name is cleaned of invalid characters, the relative
// path is appended to the manifest, the parent directories are created, and
// the final path is checked against the capture-directory sandbox. Note that
// the manifest entry is added before the directory and sandbox checks run.
func (c *OperatorDebugCommand) newFilePath(dir, file string) (string, error) {
	// Replace invalid characters in filename
	safeName := helper.CleanFilename(file, "_")
	c.manifest = append(c.manifest, filepath.Join(dir, safeName))

	// Ensure parent directories exist
	parent := filepath.Join(c.collectDir, dir)
	if err := escapingfs.EnsurePath(parent, true); err != nil {
		return "", fmt.Errorf("failed to create parent directories of %q: %w", parent, err)
	}

	// Ensure filename doesn't escape the sandbox of the capture directory
	target := filepath.Join(parent, safeName)
	if escapingfs.PathEscapesSandbox(c.collectDir, target) {
		return "", fmt.Errorf("file path %q escapes capture directory %q", target, c.collectDir)
	}

	return target, nil
}
// writerGetter is a function that, when called, returns an io.WriteCloser to
// write to, or an error if one could not be obtained.
type writerGetter func ( ) ( io . WriteCloser , error )
// newFile returns a writerGetter that, when invoked, validates dir/file via
// newFilePath, creates the file and returns it as an io.WriteCloser. The
// caller is responsible for closing the writer when it is done.
//
// Note: methods cannot be generic in Go, so this returns a closure over the
// command instead; that lets the generic write helpers stay free functions
// while the file validation still has access to the command's fields.
func (c *OperatorDebugCommand) newFile(dir, file string) writerGetter {
	open := func() (io.WriteCloser, error) {
		filePath, pathErr := c.newFilePath(dir, file)
		if pathErr != nil {
			return nil, pathErr
		}

		fh, createErr := os.Create(filePath)
		if createErr != nil {
			return nil, fmt.Errorf("failed to create file %q: %w", filePath, createErr)
		}
		return fh, nil
	}
	return open
}
// writeResponseToFile writes a response object to a file. It returns an error
// that the caller should report to the UI.
func writeResponseToFile ( obj any , getWriterFn writerGetter ) error {
writer , err := getWriterFn ( )
2020-08-11 17:14:28 +00:00
if err != nil {
2022-09-22 14:02:00 +00:00
return err
2020-08-11 17:14:28 +00:00
}
2022-09-22 14:02:00 +00:00
defer writer . Close ( )
err = writeJSON ( obj , writer )
2020-08-11 17:14:28 +00:00
if err != nil {
2022-09-22 14:02:00 +00:00
return err
2020-08-11 17:14:28 +00:00
}
2022-09-22 14:02:00 +00:00
return nil
}
// writeResponseOrErrorToFile writes a response object to a file, or the error
// for that response if one was received. It returns an error that the caller
// should report to the UI.
func writeResponseOrErrorToFile ( obj any , apiErr error , getWriterFn writerGetter ) error {
writer , err := getWriterFn ( )
2020-12-01 17:36:05 +00:00
if err != nil {
2022-09-22 14:02:00 +00:00
return err
}
defer writer . Close ( )
if apiErr != nil {
obj = errorWrapper { Error : apiErr . Error ( ) }
2020-12-01 17:36:05 +00:00
}
2022-09-22 14:02:00 +00:00
err = writeJSON ( obj , writer )
if err != nil {
return err
}
return nil
}
// writeResponseStreamOrErrorToFile writes a stream of response objects to a
// file in newline-delimited JSON format, or the error for that response if one
// was received. It returns an error that the caller should report to the UI.
func writeResponseStreamOrErrorToFile [ T any ] ( obj [ ] T , apiErr error , getWriterFn writerGetter ) error {
writer , err := getWriterFn ( )
if err != nil {
return err
}
defer writer . Close ( )
if apiErr != nil {
2022-10-20 19:53:58 +00:00
wrapped := errorWrapper { Error : apiErr . Error ( ) }
2022-09-22 14:02:00 +00:00
return writeJSON ( wrapped , writer )
}
err = writeNDJSON ( obj , writer )
if err != nil {
return err
}
return nil
}
// writeJSON writes a single Nomad API object (or response error) to writer as
// a JSON object.
//
// If obj cannot be marshalled, the marshalling error is written instead,
// wrapped in an errorWrapper, and nil is returned as long as that write
// succeeds. A failed write is returned wrapped with %w so callers can inspect
// the underlying error with errors.Is / errors.As.
func writeJSON(obj any, writer io.Writer) error {
	buf, err := json.Marshal(obj)
	if err != nil {
		// Record why the object could not be serialized rather than losing
		// the entry entirely.
		buf, err = json.Marshal(errorWrapper{Error: err.Error()})
		if err != nil {
			return fmt.Errorf("could not serialize our own error: %w", err)
		}
	}

	n, err := writer.Write(buf)
	if err != nil {
		return fmt.Errorf("write error, wrote %d bytes of %d: %w", n, len(buf), err)
	}
	return nil
}
// writeNDJSON writes a slice of Nomad API objects to the archive file as
// newline-delimited JSON objects.
func writeNDJSON [ T any ] ( data [ ] T , writer io . Writer ) error {
for _ , obj := range data {
err := writeJSON ( obj , writer )
if err != nil {
return fmt . Errorf ( "failed to write to file: %w" , err )
}
_ , err = writer . Write ( [ ] byte { '\n' } )
if err != nil {
return fmt . Errorf ( "failed to write to file: %w" , err )
}
}
2020-12-01 17:36:05 +00:00
return nil
2020-08-11 17:14:28 +00:00
}
// writeError writes a JSON error object to capture errors in the debug bundle without
// reporting
2020-08-11 19:39:44 +00:00
func ( c * OperatorDebugCommand ) writeError ( dir , file string , err error ) error {
2020-08-11 17:14:28 +00:00
bytes , err := json . Marshal ( errorWrapper { Error : err . Error ( ) } )
2020-06-25 16:51:23 +00:00
if err != nil {
return err
}
return c . writeBytes ( dir , file , bytes )
}
2020-08-11 17:14:28 +00:00
// errorWrapper is the JSON envelope used to record an error message in the
// debug bundle in place of the response that was expected.
type errorWrapper struct {
// Error is the text of the error being recorded.
Error string
}
2020-06-25 16:51:23 +00:00
// writeBody is a helper that writes the body of an http.Response to the archive
2020-08-11 19:39:44 +00:00
func ( c * OperatorDebugCommand ) writeBody ( dir , file string , resp * http . Response , err error ) {
2020-06-25 16:51:23 +00:00
if err != nil {
2020-08-11 17:14:28 +00:00
c . writeError ( dir , file , err )
2020-06-25 16:51:23 +00:00
return
}
if resp . ContentLength == 0 {
return
}
2020-08-11 17:14:28 +00:00
defer resp . Body . Close ( )
2023-03-08 15:20:04 +00:00
body , err := io . ReadAll ( resp . Body )
2020-06-25 16:51:23 +00:00
if err != nil {
2020-08-11 17:14:28 +00:00
c . writeError ( dir , file , err )
2020-12-01 17:36:05 +00:00
return
2020-06-25 16:51:23 +00:00
}
2020-12-01 17:36:05 +00:00
if err := c . writeBytes ( dir , file , body ) ; err != nil {
c . Ui . Error ( err . Error ( ) )
}
2020-06-25 16:51:23 +00:00
}
2021-11-05 23:43:10 +00:00
// flagExport is the JSON-serializable snapshot of a flag.FlagSet that
// writeFlags stores in the debug bundle.
type flagExport struct {
// Name is the flag set's name.
Name string
// Parsed reports whether the flag set has been parsed.
Parsed bool
// Actual holds the flags that were set on the command line, keyed by name.
Actual map [ string ] * flag . Flag
// Formal holds every defined flag, keyed by name.
Formal map [ string ] * flag . Flag
Effective map [ string ] * flag . Flag // All flags with non-empty value
Args [ ] string // arguments after flags
// OsArgs is a copy of os.Args at the time of export.
OsArgs [ ] string
}
// writeFlags exports the CLI flags to JSON file
func ( c * OperatorDebugCommand ) writeFlags ( flags * flag . FlagSet ) {
var f flagExport
f . Name = flags . Name ( )
f . Parsed = flags . Parsed ( )
f . Formal = make ( map [ string ] * flag . Flag )
f . Actual = make ( map [ string ] * flag . Flag )
f . Effective = make ( map [ string ] * flag . Flag )
f . Args = flags . Args ( )
f . OsArgs = os . Args
// Formal flags (all flags)
flags . VisitAll ( func ( flagA * flag . Flag ) {
f . Formal [ flagA . Name ] = flagA
// Determine which of thees are "effective" flags by comparing to empty string
if flagA . Value . String ( ) != "" {
f . Effective [ flagA . Name ] = flagA
}
} )
// Actual flags (everything passed on cmdline)
flags . Visit ( func ( flag * flag . Flag ) {
f . Actual [ flag . Name ] = flag
} )
2022-09-22 14:02:00 +00:00
c . reportErr ( writeResponseToFile ( f , c . newFile ( clusterDir , "cli-flags.json" ) ) )
}
func ( c * OperatorDebugCommand ) reportErr ( err error ) {
if err != nil {
c . Ui . Error ( err . Error ( ) )
}
2021-11-05 23:43:10 +00:00
}
2020-06-25 16:51:23 +00:00
// writeManifest creates the index files
2020-08-11 19:39:44 +00:00
func ( c * OperatorDebugCommand ) writeManifest ( ) error {
2020-06-25 16:51:23 +00:00
// Write the JSON
path := filepath . Join ( c . collectDir , "index.json" )
jsonFh , err := os . Create ( path )
if err != nil {
return err
}
defer jsonFh . Close ( )
json . NewEncoder ( jsonFh ) . Encode ( c . manifest )
// Write the HTML
path = filepath . Join ( c . collectDir , "index.html" )
htmlFh , err := os . Create ( path )
if err != nil {
return err
}
defer htmlFh . Close ( )
head , _ := template . New ( "head" ) . Parse ( "<html><head><title>{{.}}</title></head>\n<body><h1>{{.}}</h1>\n<ul>" )
line , _ := template . New ( "line" ) . Parse ( "<li><a href=\"{{.}}\">{{.}}</a></li>\n" )
if err != nil {
return fmt . Errorf ( "%v" , err )
}
tail := "</ul></body></html>\n"
head . Execute ( htmlFh , c . timestamp )
for _ , f := range c . manifest {
line . Execute ( htmlFh , f )
}
htmlFh . WriteString ( tail )
return nil
}
// trap captures signals, and closes stopCh
2020-08-11 19:39:44 +00:00
func ( c * OperatorDebugCommand ) trap ( ) {
2020-06-25 16:51:23 +00:00
sigCh := make ( chan os . Signal , 1 )
signal . Notify ( sigCh ,
syscall . SIGHUP ,
syscall . SIGINT ,
syscall . SIGTERM ,
syscall . SIGQUIT )
go func ( ) {
<- sigCh
c . cancel ( )
} ( )
}
2022-01-18 02:35:51 +00:00
// verboseOut prints out to the UI, but only when verbose output is enabled.
func (c *OperatorDebugCommand) verboseOut(out string) {
	if !c.verbose {
		return
	}
	c.Ui.Output(out)
}
// verboseOutf formats its arguments as fmt.Sprintf does and passes the result
// to verboseOut.
func (c *OperatorDebugCommand) verboseOutf(format string, a ...any) {
	msg := fmt.Sprintf(format, a...)
	c.verboseOut(msg)
}
2021-08-30 09:08:12 +00:00
// TarCZF, like the tar command, recursively builds a gzip compressed tar
// archive from the directory src and writes it to the file named archive.
// Only regular files are archived; directories, symlinks and other special
// files are skipped. Entry names are slash-separated paths relative to src
// and, if target is not empty, are prefixed with target.
//
// It returns an error if src cannot be stat'ed, the archive cannot be created,
// a file cannot be read or added to the archive, or the archive cannot be
// finalized when it is closed.
func TarCZF(archive string, src, target string) (err error) {
	// ensure the src actually exists before trying to tar it
	if _, err := os.Stat(src); err != nil {
		return fmt.Errorf("Unable to tar files - %v", err.Error())
	}

	// create the archive
	fh, err := os.Create(archive)
	if err != nil {
		return err
	}
	zz := gzip.NewWriter(fh)
	tw := tar.NewWriter(zz)

	// Close innermost-first so each layer flushes into the next one. A close
	// failure means the archive is incomplete, so surface it unless an
	// earlier error is already being returned.
	defer func() {
		for _, closer := range []io.Closer{tw, zz, fh} {
			if cerr := closer.Close(); cerr != nil && err == nil {
				err = cerr
			}
		}
	}()

	// tar
	return filepath.Walk(src, func(file string, fi os.FileInfo, walkErr error) error {
		// return on any error
		if walkErr != nil {
			return walkErr
		}
		if !fi.Mode().IsRegular() {
			return nil
		}

		// The link argument is only consulted for symlinks, which are skipped.
		header, err := tar.FileInfoHeader(fi, "")
		if err != nil {
			return err
		}

		// Make the entry name relative to src. Only the leading src is
		// removed; later path components that happen to contain the same text
		// are left intact.
		rel, err := filepath.Rel(src, file)
		if err != nil {
			return err
		}
		if rel == "." {
			// src itself is a regular file
			rel = ""
		}

		name := rel
		if target != "" {
			name = filepath.Join(target, rel)
		}
		// tar entry names are slash-separated and must not be absolute
		name = strings.TrimPrefix(filepath.ToSlash(name), "/")
		if name == "" {
			name = fi.Name()
		}
		header.Name = name

		if err := tw.WriteHeader(header); err != nil {
			return err
		}

		// copy the file contents
		f, err := os.Open(file)
		if err != nil {
			return err
		}
		_, err = io.Copy(tw, f)
		// Close unconditionally so a failed copy does not leak the handle;
		// the file was only read, so its close error carries no data loss.
		f.Close()
		return err
	})
}
2021-10-12 20:58:41 +00:00
// filterServerMembers returns a slice of server member names matching the search criteria
func filterServerMembers ( serverMembers * api . ServerMembers , serverIDs string , region string ) ( membersFound [ ] string , err error ) {
if serverMembers . Members == nil {
return nil , fmt . Errorf ( "Failed to parse server members, members==nil" )
}
prefixes := stringToSlice ( serverIDs )
// "leader" is a special case which Nomad handles in the API. If "leader"
// appears in serverIDs, add it to membersFound and remove it from the list
// so that it isn't processed by the range loop
2022-09-21 19:53:25 +00:00
if slices . Contains ( prefixes , "leader" ) {
2021-10-12 20:58:41 +00:00
membersFound = append ( membersFound , "leader" )
helper . RemoveEqualFold ( & prefixes , "leader" )
}
for _ , member := range serverMembers . Members {
// If region is provided it must match exactly
if region != "" && member . Tags [ "region" ] != region {
continue
}
// Always include "all"
if serverIDs == "all" {
membersFound = append ( membersFound , member . Name )
continue
}
// Include member if name matches any prefix from serverIDs
if helper . StringHasPrefixInSlice ( member . Name , prefixes ) {
membersFound = append ( membersFound , member . Name )
}
}
return membersFound , nil
}
// stringToSlice breaks a comma-separated string into its elements, trimming
// surrounding whitespace from each and dropping elements that end up empty.
// The result is nil when no element remains.
func stringToSlice(input string) []string {
	var elements []string
	for _, field := range strings.Split(input, ",") {
		if value := strings.TrimSpace(field); value != "" {
			elements = append(elements, value)
		}
	}
	return elements
}
2020-08-11 17:14:28 +00:00
2022-01-18 02:35:51 +00:00
// parseEventTopics converts a list of "topic" or "topic:filter" strings into a
// map from topic to the filters requested for it. Every entry is examined;
// entries that fail to parse are left out of the map and their errors are
// accumulated and returned together.
func parseEventTopics(topicList []string) (map[api.Topic][]string, error) {
	topics := make(map[api.Topic][]string)

	var mErrs *multierror.Error

	for _, topic := range topicList {
		k, v, err := parseTopic(topic)
		if err != nil {
			mErrs = multierror.Append(mErrs, err)
			// Don't record the empty topic/filter returned with the error.
			continue
		}

		key := api.Topic(k)
		topics[key] = append(topics[key], v)
	}

	return topics, mErrs.ErrorOrNil()
}
// parseTopic splits input of the form "topic" or "topic:filter" into its
// topic and filter. When no filter is given, the wildcard "*" is used. The
// returned topic is title-cased. Input containing more than one ':' is
// rejected with an error that names the offending input.
func parseTopic(input string) (string, string, error) {
	var topic, filter string

	parts := strings.Split(input, ":")
	switch len(parts) {
	case 1:
		// infer wildcard if only given a topic
		topic, filter = input, "*"
	case 2:
		topic, filter = parts[0], parts[1]
	default:
		// topic is still unset here, so report the raw input instead
		return "", "", fmt.Errorf("Invalid key value pair for topic: %s", input)
	}

	return strings.Title(topic), filter, nil
}
// allTopics returns a topic map holding only the wildcard topic "*" with the
// wildcard filter "*".
func allTopics() map[api.Topic][]string {
	const wildcard = "*"
	return map[api.Topic][]string{wildcard: {wildcard}}
}
// topicsFromString turns a comma-separated topic list into a topic map. The
// literal "none" yields a nil map and "all" yields the wildcard map from
// allTopics; anything else is split on commas and parsed entry by entry. On a
// parse error the map is nil and the error is returned.
func topicsFromString(topicList string) (map[api.Topic][]string, error) {
	switch topicList {
	case "none":
		return nil, nil
	case "all":
		return allTopics(), nil
	}

	parsed, err := parseEventTopics(stringToSlice(topicList))
	if err != nil {
		return nil, err
	}
	return parsed, nil
}
2020-08-11 17:14:28 +00:00
// external holds address configuration for Consul and Vault APIs
type external struct {
// tls is not referenced in this part of the file; presumably the TLS
// settings used when contacting the service — TODO confirm against callers.
tls * api . TLSConfig
// addrVal is the configured address; read it through addr and set it
// through setAddr.
addrVal string
// auth is filled from the "Auth" value of the agent's self config.
auth string
// ssl selects "https://" over "http://" when an address has no scheme.
ssl bool
// tokenVal is an explicit token; when set it takes precedence over tokenFile.
tokenVal string
// tokenFile is the path of a file whose trimmed contents are used as the
// token when tokenVal is empty.
tokenFile string
}
func ( e * external ) addr ( defaultAddr string ) string {
if e . addrVal == "" {
return defaultAddr
}
2021-11-05 23:43:10 +00:00
// Return address as-is if it contains a protocol
if strings . Contains ( e . addrVal , "://" ) {
return e . addrVal
2020-08-11 17:14:28 +00:00
}
2021-11-05 23:43:10 +00:00
if e . ssl {
return "https://" + e . addrVal
2020-08-11 17:14:28 +00:00
}
2021-11-05 23:43:10 +00:00
return "http://" + e . addrVal
}
func ( e * external ) setAddr ( addr string ) {
// Handle no protocol scenario first
if ! strings . Contains ( addr , "://" ) {
e . addrVal = "http://" + addr
if e . ssl {
e . addrVal = "https://" + addr
}
return
2020-08-11 17:14:28 +00:00
}
2021-11-05 23:43:10 +00:00
// Set SSL boolean based on protocol
e . ssl = false
if strings . Contains ( addr , "https" ) {
e . ssl = true
}
e . addrVal = addr
2020-08-11 17:14:28 +00:00
}
func ( e * external ) token ( ) string {
if e . tokenVal != "" {
return e . tokenVal
}
if e . tokenFile != "" {
2023-03-08 15:20:04 +00:00
bs , err := os . ReadFile ( e . tokenFile )
2020-08-11 17:14:28 +00:00
if err == nil {
return strings . TrimSpace ( string ( bs ) )
}
}
return ""
}
2021-11-05 23:43:10 +00:00
// getConsulAddrFromSelf copies the Consul settings found under the "Consul"
// key of the agent's self config ("EnableSSL", "Addr", "Auth", "Token") onto
// c.consul and returns the resulting Consul address. It returns "" when self
// is nil or has no "Consul" section of the expected shape. An "Addr" or
// "Token" value that is missing or not a string is skipped instead of causing
// a panic.
func (c *OperatorDebugCommand) getConsulAddrFromSelf(self *api.AgentSelf) string {
	if self == nil {
		return ""
	}

	m, ok := self.Config["Consul"].(map[string]any)
	if !ok {
		return ""
	}

	// ssl is set before the address because setAddr consults it when the
	// address carries no scheme.
	c.consul.ssl, _ = m["EnableSSL"].(bool)

	if addr, ok := m["Addr"].(string); ok {
		c.consul.setAddr(addr)
	}

	c.consul.auth, _ = m["Auth"].(string)

	if token, ok := m["Token"].(string); ok {
		c.consul.tokenVal = token
	}

	return c.consul.addr("")
}
// getVaultAddrFromSelf copies the Vault settings found under the "Vault" key
// of the agent's self config ("EnableSSL", "Addr", "Auth", "Token") onto
// c.vault and returns the resulting Vault address. It returns "" when self is
// nil or has no "Vault" section of the expected shape. An "Addr" or "Token"
// value that is missing or not a string is skipped instead of causing a
// panic.
func (c *OperatorDebugCommand) getVaultAddrFromSelf(self *api.AgentSelf) string {
	if self == nil {
		return ""
	}

	m, ok := self.Config["Vault"].(map[string]any)
	if !ok {
		return ""
	}

	// ssl is set before the address because setAddr consults it when the
	// address carries no scheme.
	c.vault.ssl, _ = m["EnableSSL"].(bool)

	if addr, ok := m["Addr"].(string); ok {
		c.vault.setAddr(addr)
	}

	c.vault.auth, _ = m["Auth"].(string)

	if token, ok := m["Token"].(string); ok {
		c.vault.tokenVal = token
	}

	return c.vault.addr("")
}
2020-08-11 17:14:28 +00:00
// defaultHttpClient returns cleanhttp's default client with its transport
// adjusted to require TLS 1.2 or newer and to give up on TLS handshakes after
// ten seconds. The client's transport is asserted to be an *http.Transport;
// the assertion panics if it is not.
func defaultHttpClient() *http.Client {
	client := cleanhttp.DefaultClient()

	tr := client.Transport.(*http.Transport)
	tr.TLSClientConfig = &tls.Config{MinVersion: tls.VersionTLS12}
	tr.TLSHandshakeTimeout = 10 * time.Second

	return client
}
2022-01-17 16:15:17 +00:00
// isRedirectError returns true if an error is a redirect error.
//
// Detection is textual: the error message must contain the JSON decoder's
// complaint about a leading '<', presumably produced when an HTML page is
// returned where JSON was expected. A nil error is never a redirect error.
func isRedirectError(err error) bool {
	if err == nil {
		return false
	}

	// No padding around the text: the decoder's message ends right after
	// "value", so surrounding spaces would keep it from ever matching.
	const redirectErr string = `invalid character '<' looking for beginning of value`
	return strings.Contains(err.Error(), redirectErr)
}
2022-04-28 17:18:55 +00:00
// getNomadVersion looks up the Nomad version recorded for a server and/or a
// client node. Servers are searched in c.members and nodes in c.nodes. When
// several entries match, the last match wins, and a node match takes
// precedence over a server match. It returns "" when both IDs are empty or
// nothing matches.
func (c *OperatorDebugCommand) getNomadVersion(serverID string, nodeID string) string {
	if serverID == "" && nodeID == "" {
		return ""
	}

	var found string

	if serverID != "" {
		for _, member := range c.members.Members {
			// Raft v2 server
			if member.Name == serverID {
				found = member.Tags["build"]
			}

			// Raft v3 server
			if member.Tags["id"] == serverID {
				found = member.Tags["version"]
			}
		}
	}

	if nodeID != "" {
		for _, n := range c.nodes {
			if n.ID != nodeID {
				continue
			}
			found = n.Version
		}
	}

	return found
}
// checkVersion checks version against versionConstraint, where the constraint
// describes the versions that are NOT supported. It returns nil when version
// falls outside the constraint, and an error when version matches it or when
// either argument cannot be parsed.
func checkVersion(version string, versionConstraint string) error {
	v, err := goversion.NewVersion(version)
	if err != nil {
		return fmt.Errorf("error: %w", err)
	}

	c, err := goversion.NewConstraint(versionConstraint)
	if err != nil {
		return fmt.Errorf("error: %w", err)
	}

	if c.Check(v) {
		// Report the constraint that was actually evaluated.
		return fmt.Errorf("unsupported version=%s matches version filter %s", version, versionConstraint)
	}

	return nil
}