package command

import (
	"context"
	"encoding/json"
	"fmt"
	"io"
	"os"
	"strings"
	"sync"
	"time"

	"github.com/docker/docker/pkg/ioutils"
	"github.com/hashicorp/consul/api"
	wrapping "github.com/hashicorp/go-kms-wrapping"
	log "github.com/hashicorp/go-hclog"
	"github.com/hashicorp/go-secure-stdlib/reloadutil"
	uuid "github.com/hashicorp/go-uuid"
	cserver "github.com/hashicorp/vault/command/server"
	"github.com/hashicorp/vault/helper/constants"
	"github.com/hashicorp/vault/helper/metricsutil"
	"github.com/hashicorp/vault/internalshared/configutil"
	"github.com/hashicorp/vault/internalshared/listenerutil"
	physconsul "github.com/hashicorp/vault/physical/consul"
	"github.com/hashicorp/vault/physical/raft"
	"github.com/hashicorp/vault/sdk/physical"
	"github.com/hashicorp/vault/sdk/version"
	sr "github.com/hashicorp/vault/serviceregistration"
	srconsul "github.com/hashicorp/vault/serviceregistration/consul"
	"github.com/hashicorp/vault/vault"
	"github.com/hashicorp/vault/vault/diagnose"
	"github.com/mitchellh/cli"
	"github.com/posener/complete"
	"golang.org/x/term"
)

// OperatorDiagnoseEnableEnv is not referenced in this file.
// NOTE(review): presumably the name of the environment variable that enables
// the diagnose command; confirm against the command registration.
const OperatorDiagnoseEnableEnv = "VAULT_DIAGNOSE"

// CoreConfigUninitializedErr is the message reported by the "Check Core
// Creation" step when the core configuration has no raw config set.
const CoreConfigUninitializedErr = "Diagnose cannot attempt this step because core config could not be set."

var (
	_ cli.Command             = (*OperatorDiagnoseCommand)(nil)
	_ cli.CommandAutocomplete = (*OperatorDiagnoseCommand)(nil)
)

// OperatorDiagnoseCommand implements the "vault operator diagnose" command,
// which runs a series of offline checks against a Vault server configuration.
type OperatorDiagnoseCommand struct {
	*BaseCommand
	diagnose *diagnose.Session

	flagDebug    bool
	flagSkips    []string
	flagConfigs  []string
	cleanupGuard sync.Once

	reloadFuncsLock      *sync.RWMutex
	reloadFuncs          *map[string][]reloadutil.ReloadFunc
	ServiceRegistrations map[string]sr.Factory
	startedCh            chan struct{} // for tests
	reloadedCh           chan struct{} // for tests
	skipEndEnd           bool          // for tests
}

// Synopsis returns the one-line description of the command.
func (c *OperatorDiagnoseCommand) Synopsis() string {
	return "Troubleshoot problems starting Vault"
}

// Help returns the usage text for the command followed by the help for its
// flags, with surrounding whitespace trimmed.
func (c *OperatorDiagnoseCommand) Help() string {
	helpText := `
Usage: vault operator diagnose

  This command troubleshoots Vault startup issues, such as TLS configuration or
  auto-unseal. It should be run using the same environment variables and
  configuration files as the "vault server" command, so that startup problems
  can be accurately reproduced.

  Start diagnose with a configuration file:

      $ vault operator diagnose -config=/etc/vault/config.hcl

  Perform a diagnostic check while Vault is still running:

      $ vault operator diagnose -config=/etc/vault/config.hcl -skip=listener

` + c.Flags().Help()
	return strings.TrimSpace(helpText)
}

// Flags builds the flag sets understood by the command: -config, -skip,
// -debug and -format.
func (c *OperatorDiagnoseCommand) Flags() *FlagSets {
	set := NewFlagSets(c.UI)
	f := set.NewFlagSet("Command Options")

	f.StringSliceVar(&StringSliceVar{
		Name:   "config",
		Target: &c.flagConfigs,
		Completion: complete.PredictOr(
			complete.PredictFiles("*.hcl"),
			complete.PredictFiles("*.json"),
			complete.PredictDirs("*"),
		),
		Usage: "Path to a Vault configuration file or directory of configuration " +
			"files. This flag can be specified multiple times to load multiple " +
			"configurations. If the path is a directory, all files which end in " +
			".hcl or .json are loaded.",
	})

	f.StringSliceVar(&StringSliceVar{
		Name:   "skip",
		Target: &c.flagSkips,
		Usage:  "Skip the health checks named as arguments. May be 'listener', 'storage', or 'autounseal'.",
	})

	f.BoolVar(&BoolVar{
		Name:    "debug",
		Target:  &c.flagDebug,
		Default: false,
		Usage:   "Dump all information collected by Diagnose.",
	})

	f.StringVar(&StringVar{
		Name:   "format",
		Target: &c.flagFormat,
		Usage:  "The output format",
	})

	return set
}

// AutocompleteArgs reports that the command takes no positional arguments.
func (c *OperatorDiagnoseCommand) AutocompleteArgs() complete.Predictor {
	return complete.PredictNothing
}

// AutocompleteFlags returns the completions for the command's flags.
func (c *OperatorDiagnoseCommand) AutocompleteFlags() complete.Flags {
	return c.Flags().Completions()
}

// Status prefixes and terminal control sequences. They are not referenced in
// this file.
const (
	status_unknown = "[ ] "
	status_ok      = "\u001b[32m[ ok ]\u001b[0m "
	status_failed  = "\u001b[31m[failed]\u001b[0m "
	status_warn    = "\u001b[33m[ warn ]\u001b[0m "
	same_line      = "\u001b[F"
)

// Run parses args and runs the diagnose checks. It returns 3 when the flags
// cannot be parsed; otherwise it returns the result of RunWithParsedFlags.
func (c *OperatorDiagnoseCommand) Run(args []string) int {
	f := c.Flags()
	if err := f.Parse(args); err != nil {
		c.UI.Error(err.Error())
		return 3
	}
	return c.RunWithParsedFlags()
}

// RunWithParsedFlags runs the offline diagnostics using the flag values
// already stored on c and prints the results, as JSON when -format=json and
// as text otherwise.
//
// Return codes:
//
//	0  the checks finished without warnings or errors
//	1  the overall result status is an error
//	2  the overall result status is a warning
//	3  no -config was given
//	4  the diagnostics returned an error, or the results could not be marshalled
func (c *OperatorDiagnoseCommand) RunWithParsedFlags() int {
	if len(c.flagConfigs) == 0 {
		c.UI.Error("Must specify a configuration file using -config.")
		return 3
	}

	if c.diagnose == nil {
		if c.flagFormat == "json" {
			// Progress output is discarded so that only the JSON document is printed.
			c.diagnose = diagnose.New(&ioutils.NopWriter{})
		} else {
			c.UI.Output(version.GetVersion().FullVersionNumber(true))
			c.diagnose = diagnose.New(os.Stdout)
		}
	}

	ctx := diagnose.Context(context.Background(), c.diagnose)
	c.diagnose.SkipFilters = c.flagSkips

	// The results are printed even when the diagnostics stop early, so the
	// error is only acted upon after the output has been produced.
	diagErr := c.offlineDiagnostics(ctx)

	results := c.diagnose.Finalize(ctx)
	if c.flagFormat == "json" {
		resultsJS, marshalErr := json.MarshalIndent(results, "", " ")
		if marshalErr != nil {
			fmt.Fprintf(os.Stderr, "Error marshalling results: %v.", marshalErr)
			return 4
		}
		c.UI.Output(string(resultsJS))
	} else {
		c.UI.Output("\nResults:")
		width, _, sizeErr := term.GetSize(0)
		if sizeErr != nil {
			// Terminal size is unavailable; a width of 0 is passed instead.
			width = 0
		}
		results.Write(os.Stdout, width)
	}

	if diagErr != nil {
		return 4
	}

	// Use a different return code
	switch results.Status {
	case diagnose.WarningStatus:
		return 2
	case diagnose.ErrorStatus:
		return 1
	}
	return 0
}

// offlineDiagnostics runs the diagnose checks inside a "Vault Diagnose" span:
// OS checks, configuration parsing, storage, service discovery, seals, core
// configuration, HA storage, redirect and cluster addresses, core creation,
// license, listeners and auto-unseal encryption.
//
// Failures of individual checks are recorded on the diagnose session. A
// non-nil error is returned only when a later step cannot be attempted: no
// configuration, no storage backend, an undeterminable redirect or cluster
// address, or a core that could not be created.
func (c *OperatorDiagnoseCommand) offlineDiagnostics(ctx context.Context) error {
	rloadFuncs := make(map[string][]reloadutil.ReloadFunc)
	server := &ServerCommand{
		// TODO: set up a different one?
		// In particular, a UI instance that won't output?
		BaseCommand: c.BaseCommand,

		// TODO: refactor to a common place?
		AuditBackends:        auditBackends,
		CredentialBackends:   credentialBackends,
		LogicalBackends:      logicalBackends,
		PhysicalBackends:     physicalBackends,
		ServiceRegistrations: serviceRegistrations,

		// TODO: other ServerCommand options?

		logger: log.NewInterceptLogger(&log.LoggerOptions{
			Level: log.Off,
		}),
		allLoggers:      []log.Logger{},
		reloadFuncs:     &rloadFuncs,
		reloadFuncsLock: new(sync.RWMutex),
	}

	ctx, span := diagnose.StartSpan(ctx, "Vault Diagnose")
	defer span.End()

	// OS Specific checks
	diagnose.OSChecks(ctx)

	var config *cserver.Config

	diagnose.Test(ctx, "Parse Configuration", func(ctx context.Context) (err error) {
		server.flagConfigs = c.flagConfigs
		var configErrors []configutil.ConfigError
		config, configErrors, err = server.parseConfig()
		if err != nil {
			return fmt.Errorf("Could not parse configuration: %w.", err)
		}
		for _, ce := range configErrors {
			diagnose.Warn(ctx, diagnose.CapitalizeFirstLetter(ce.String())+".")
		}
		diagnose.Success(ctx, "Vault configuration syntax is ok.")
		return nil
	})
	if config == nil {
		return fmt.Errorf("No vault server configuration found.")
	}

	var metricSink *metricsutil.ClusterMetricSink
	var metricsHelper *metricsutil.MetricsHelper

	var backend *physical.Backend
	diagnose.Test(ctx, "Check Storage", func(ctx context.Context) error {
		// Ensure that there is a storage stanza
		if config.Storage == nil {
			diagnose.Advise(ctx, "To learn how to specify a storage backend, see the Vault server configuration documentation.")
			return fmt.Errorf("No storage stanza in Vault server configuration.")
		}

		diagnose.Test(ctx, "Create Storage Backend", func(ctx context.Context) error {
			b, err := server.setupStorage(config)
			if err != nil {
				return err
			}
			if b == nil {
				diagnose.Advise(ctx, "To learn how to specify a storage backend, see the Vault server configuration documentation.")
				return fmt.Errorf("Storage backend could not be initialized.")
			}
			backend = &b
			return nil
		})

		if backend == nil {
			// The top-level span is ended by the deferred span.End() once the
			// enclosing function returns, so it is not ended here.
			diagnose.Fail(ctx, "Diagnose could not initialize storage backend.")
			return fmt.Errorf("Diagnose could not initialize storage backend.")
		}

		// Check for raft quorum status
		if config.Storage.Type == storageTypeRaft {
			// The environment variable is consulted first and the configured
			// path is only a fallback; the folder checks run against
			// whichever of the two is in effect.
			path := os.Getenv(raft.EnvVaultRaftPath)
			pathFound := path != ""
			if !pathFound {
				path, pathFound = config.Storage.Config["path"]
			}
			if pathFound {
				diagnose.RaftFileChecks(ctx, path)
			} else {
				diagnose.SpotError(ctx, "Check Raft Folder Permissions", fmt.Errorf("Storage folder path is required."))
			}
			diagnose.RaftStorageQuorum(ctx, (*backend).(*raft.RaftBackend))
		}

		// Consul storage checks
		if config.Storage.Type == storageTypeConsul {
			diagnose.Test(ctx, "Check Consul TLS", func(ctx context.Context) error {
				return physconsul.SetupSecureTLS(ctx, api.DefaultConfig(), config.Storage.Config, server.logger, true)
			})

			diagnose.Test(ctx, "Check Consul Direct Storage Access", func(ctx context.Context) error {
				dirAccess := diagnose.ConsulDirectAccess(config.Storage.Config)
				if dirAccess != "" {
					diagnose.Warn(ctx, dirAccess)
				}
				if dirAccess == diagnose.DirAccessErr {
					diagnose.Advise(ctx, diagnose.DirAccessAdvice)
				}
				return nil
			})
		}

		// Attempt to use storage backend
		if !c.skipEndEnd && config.Storage.Type != storageTypeRaft {
			diagnose.Test(ctx, "Check Storage Access", diagnose.WithTimeout(30*time.Second, func(ctx context.Context) error {
				maxDurationCrudOperation := "write"
				maxDuration := time.Duration(0)
				uuidSuffix, err := uuid.GenerateUUID()
				if err != nil {
					return err
				}
				key := "diagnose/latency/" + uuidSuffix

				dur, err := diagnose.EndToEndLatencyCheckWrite(ctx, key, *backend)
				if err != nil {
					return err
				}
				maxDuration = dur

				dur, err = diagnose.EndToEndLatencyCheckRead(ctx, key, *backend)
				if err != nil {
					return err
				}
				if dur > maxDuration {
					maxDuration = dur
					maxDurationCrudOperation = "read"
				}

				dur, err = diagnose.EndToEndLatencyCheckDelete(ctx, key, *backend)
				if err != nil {
					return err
				}
				if dur > maxDuration {
					maxDuration = dur
					maxDurationCrudOperation = "delete"
				}

				if maxDuration > time.Duration(0) {
					diagnose.Warn(ctx, diagnose.LatencyWarning+fmt.Sprintf("duration: %s, operation: %s", maxDuration, maxDurationCrudOperation))
				}
				return nil
			}))
		}
		return nil
	})

	// Return from top-level span when backend is nil
	if backend == nil {
		return fmt.Errorf("Diagnose could not initialize storage backend.")
	}

	var configSR sr.ServiceRegistration
	diagnose.Test(ctx, "Check Service Discovery", func(ctx context.Context) error {
		if config.ServiceRegistration == nil || config.ServiceRegistration.Config == nil {
			diagnose.Skipped(ctx, "No service registration configured.")
			return nil
		}
		srConfig := config.ServiceRegistration.Config

		diagnose.Test(ctx, "Check Consul Service Discovery TLS", func(ctx context.Context) error {
			// SetupSecureTLS for service discovery uses the same cert and key to set up physical
			// storage. See the consul package in physical for details.
			return srconsul.SetupSecureTLS(ctx, api.DefaultConfig(), srConfig, server.logger, true)
		})

		if config.ServiceRegistration.Type == "consul" {
			diagnose.Test(ctx, "Check Consul Direct Service Discovery", func(ctx context.Context) error {
				dirAccess := diagnose.ConsulDirectAccess(config.ServiceRegistration.Config)
				if dirAccess != "" {
					diagnose.Warn(ctx, dirAccess)
				}
				if dirAccess == diagnose.DirAccessErr {
					diagnose.Advise(ctx, diagnose.DirAccessAdvice)
				}
				return nil
			})
		}
		return nil
	})

	sealcontext, sealspan := diagnose.StartSpan(ctx, "Create Vault Server Configuration Seals")
	var seals []vault.Seal
	var sealConfigError error
	barrierSeal, barrierWrapper, unwrapSeal, seals, sealConfigError, err := setSeal(server, config, make([]string, 0), make(map[string]string))
	// Check error here
	if err != nil {
		diagnose.Advise(ctx, "For assistance with the seal stanza, see the Vault configuration documentation.")
		diagnose.Fail(sealcontext, fmt.Sprintf("Seal creation resulted in the following error: %s.", err.Error()))
		goto SEALFAIL
	}
	if sealConfigError != nil {
		diagnose.Fail(sealcontext, "Seal could not be configured: seals may already be initialized.")
		goto SEALFAIL
	}

	for _, seal := range seals {
		// There is always one nil seal. We need to skip it so we don't start an empty Finalize-Seal-Shamir
		// section.
		if seal == nil {
			continue
		}
		// Ensure that the seal finalizer is called, even if using verify-only.
		// The seal is passed by value so that every deferred call finalizes
		// the seal of its own iteration rather than sharing the loop variable.
		defer func(seal vault.Seal) {
			sealType := diagnose.CapitalizeFirstLetter(seal.BarrierType())
			finalizeSealContext, finalizeSealSpan := diagnose.StartSpan(ctx, "Finalize "+sealType+" Seal")
			defer finalizeSealSpan.End()
			if err := seal.Finalize(finalizeSealContext); err != nil {
				diagnose.Fail(finalizeSealContext, "Error finalizing seal.")
				diagnose.Advise(finalizeSealContext, "This likely means that the barrier is still in use; therefore, finalizing the seal timed out.")
			}
		}(seal)
	}

	if barrierSeal == nil {
		diagnose.Fail(sealcontext, "Could not create barrier seal. No error was generated, but it is likely that the seal stanza is misconfigured. For guidance, see Vault's configuration documentation on the seal stanza.")
	}

SEALFAIL:
	sealspan.End()

	diagnose.Test(ctx, "Check Transit Seal TLS", func(ctx context.Context) error {
		var checkSealTransit bool
		for _, seal := range config.Seals {
			if seal.Type != "transit" {
				continue
			}
			checkSealTransit = true

			if seal.Config["tls_skip_verify"] == "true" {
				diagnose.Warn(ctx, "TLS verification is skipped. This is highly discouraged and decreases the security of data transmissions to and from the Vault server.")
				return nil
			}

			// Checking tls_client_cert and tls_client_key
			tlsClientCert, ok := seal.Config["tls_client_cert"]
			if !ok {
				diagnose.Warn(ctx, "Missing tls_client_cert in the seal configuration.")
				return nil
			}
			tlsClientKey, ok := seal.Config["tls_client_key"]
			if !ok {
				diagnose.Warn(ctx, "Missing tls_client_key in the seal configuration.")
				return nil
			}
			if _, err := diagnose.TLSFileChecks(tlsClientCert, tlsClientKey); err != nil {
				return fmt.Errorf("The TLS certificate and key configured through the tls_client_cert and tls_client_key fields of the transit seal configuration are invalid: %w.", err)
			}

			// checking tls_ca_cert
			tlsCACert, ok := seal.Config["tls_ca_cert"]
			if !ok {
				diagnose.Warn(ctx, "Missing tls_ca_cert in the seal configuration.")
				return nil
			}
			warnings, err := diagnose.TLSCAFileCheck(tlsCACert)
			// Warnings are reported even when the check also returned an error.
			for _, warning := range warnings {
				diagnose.Warn(ctx, warning)
			}
			if err != nil {
				return fmt.Errorf("The TLS CA certificate configured through the tls_ca_cert field of the transit seal configuration is invalid: %w.", err)
			}
		}
		if !checkSealTransit {
			diagnose.Skipped(ctx, "No transit seal found in seal configuration.")
		}
		return nil
	})

	var coreConfig vault.CoreConfig
	diagnose.Test(ctx, "Create Core Configuration", func(ctx context.Context) error {
		var secureRandomReader io.Reader
		// prepare a secure random reader for core
		randReaderTestName := "Initialize Randomness for Core"
		secureRandomReader, err = configutil.CreateSecureRandomReaderFunc(config.SharedConfig, barrierWrapper)
		if err != nil {
			return diagnose.SpotError(ctx, randReaderTestName, fmt.Errorf("Could not initialize randomness for core: %w.", err))
		}
		diagnose.SpotOk(ctx, randReaderTestName, "")
		coreConfig = createCoreConfig(server, config, *backend, configSR, barrierSeal, unwrapSeal, metricsHelper, metricSink, secureRandomReader)
		return nil
	})

	var disableClustering bool
	diagnose.Test(ctx, "HA Storage", func(ctx context.Context) error {
		diagnose.Test(ctx, "Create HA Storage Backend", func(ctx context.Context) error {
			// Initialize the separate HA storage backend, if it exists
			disableClustering, err = initHaBackend(server, config, &coreConfig, *backend)
			return err
		})

		diagnose.Test(ctx, "Check HA Consul Direct Storage Access", func(ctx context.Context) error {
			if config.HAStorage == nil {
				diagnose.Skipped(ctx, "No HA storage stanza is configured.")
				return nil
			}
			dirAccess := diagnose.ConsulDirectAccess(config.HAStorage.Config)
			if dirAccess != "" {
				diagnose.Warn(ctx, dirAccess)
			}
			if dirAccess == diagnose.DirAccessErr {
				diagnose.Advise(ctx, diagnose.DirAccessAdvice)
			}
			return nil
		})

		if config.HAStorage != nil && config.HAStorage.Type == storageTypeConsul {
			diagnose.Test(ctx, "Check Consul TLS", func(ctx context.Context) error {
				return physconsul.SetupSecureTLS(ctx, api.DefaultConfig(), config.HAStorage.Config, server.logger, true)
			})
		}
		return nil
	})

	// Determine the redirect address from environment variables
	err = determineRedirectAddr(server, &coreConfig, config)
	if err != nil {
		return diagnose.SpotError(ctx, "Determine Redirect Address", fmt.Errorf("Redirect Address could not be determined: %w.", err))
	}
	diagnose.SpotOk(ctx, "Determine Redirect Address", "")

	err = findClusterAddress(server, &coreConfig, config, disableClustering)
	if err != nil {
		return diagnose.SpotError(ctx, "Check Cluster Address", fmt.Errorf("Cluster Address could not be determined or was invalid: %w.", err),
			diagnose.Advice("Please check that the API and Cluster addresses are different, and that the API, Cluster and Redirect addresses have both a host and port."))
	}
	diagnose.SpotOk(ctx, "Check Cluster Address", "Cluster address is logically valid and can be found.")

	var vaultCore *vault.Core

	// Run all the checks that are utilized when initializing a core object
	// without actually calling core.Init. These are in the init-core section
	// as they are runtime checks.
	diagnose.Test(ctx, "Check Core Creation", func(ctx context.Context) error {
		if coreConfig.RawConfig == nil {
			return fmt.Errorf(CoreConfigUninitializedErr)
		}
		core, newCoreError := vault.CreateCore(&coreConfig)
		if newCoreError != nil {
			if vault.IsFatalError(newCoreError) {
				return fmt.Errorf("Error initializing core: %s.", newCoreError)
			}
			// On a non-fatal error vaultCore stays nil, which ends the
			// diagnostics right after this step.
			diagnose.Warn(ctx, wrapAtLength(
				"A non-fatal error occurred during initialization. Please check the logs for more information."))
			return nil
		}
		vaultCore = core
		return nil
	})

	if vaultCore == nil {
		return fmt.Errorf("Diagnose could not initialize the Vault core from the Vault server configuration.")
	}

	licenseCtx, licenseSpan := diagnose.StartSpan(ctx, "Check For Autoloaded License")
	// If we are not in enterprise, return from the check
	if !constants.IsEnterprise {
		diagnose.Skipped(licenseCtx, "License check will not run on OSS Vault.")
	} else {
		// Load License from environment variables. These take precedence over the
		// configured license.
		if envLicensePath := os.Getenv(EnvVaultLicensePath); envLicensePath != "" {
			coreConfig.LicensePath = envLicensePath
		}
		if envLicense := os.Getenv(EnvVaultLicense); envLicense != "" {
			coreConfig.License = envLicense
		}
		vault.DiagnoseCheckLicense(licenseCtx, vaultCore, coreConfig, false)
	}
	licenseSpan.End()

	var lns []listenerutil.Listener
	diagnose.Test(ctx, "Start Listeners", func(ctx context.Context) error {
		disableClustering := config.HAStorage != nil && config.HAStorage.DisableClustering
		infoKeys := make([]string, 0, 10)
		info := make(map[string]string)
		var listeners []listenerutil.Listener
		var status int

		diagnose.ListenerChecks(ctx, config.Listeners)

		diagnose.Test(ctx, "Create Listeners", func(ctx context.Context) error {
			status, listeners, _, err = server.InitListeners(config, disableClustering, &infoKeys, &info)
			if status != 0 {
				return err
			}
			return nil
		})

		lns = listeners

		// Make sure we close all listeners from this point on
		listenerCloseFunc := func() {
			for _, ln := range lns {
				ln.Listener.Close()
			}
		}

		c.cleanupGuard.Do(listenerCloseFunc)

		return nil
	})

	// TODO: Diagnose logging configuration

	// The unseal diagnose check will simply attempt to use the barrier to encrypt and
	// decrypt a mock value. It will not call runUnseal.
	diagnose.Test(ctx, "Check Autounseal Encryption", diagnose.WithTimeout(30*time.Second, func(ctx context.Context) error {
		if barrierSeal == nil {
			return fmt.Errorf("Diagnose could not create a barrier seal object.")
		}
		if barrierSeal.BarrierType() == wrapping.Shamir {
			diagnose.Skipped(ctx, "Skipping barrier encryption test. Only supported for auto-unseal.")
			return nil
		}
		barrierUUID, err := uuid.GenerateUUID()
		if err != nil {
			return fmt.Errorf("Diagnose could not create unique UUID for unsealing: %w.", err)
		}
		barrierEncValue := "diagnose-" + barrierUUID
		ciphertext, err := barrierWrapper.Encrypt(ctx, []byte(barrierEncValue), nil)
		if err != nil {
			return fmt.Errorf("Error encrypting with seal barrier: %w.", err)
		}
		plaintext, err := barrierWrapper.Decrypt(ctx, ciphertext, nil)
		if err != nil {
			return fmt.Errorf("Error decrypting with seal barrier: %w", err)
		}
		if string(plaintext) != barrierEncValue {
			return fmt.Errorf("Barrier returned incorrect decrypted value for mock data.")
		}
		return nil
	}))

	// The following block contains static checks that are run during the
	// startHttpServers portion of server run. In other words, they are static
	// checks during resource creation. Currently there is nothing important in this
	// diagnose check. For now it is a placeholder for any checks that will be done
	// before server run.
	diagnose.Test(ctx, "Check Server Before Runtime", func(ctx context.Context) error {
		for _, ln := range lns {
			if ln.Config == nil {
				return fmt.Errorf("Found no listener config after parsing the Vault configuration.")
			}
		}
		return nil
	})
	return nil
}