open-vault/command/operator_diagnose.go

package command

import (
	"context"
	"encoding/json"
	"fmt"
	"io"
	"os"
	"strings"
	"sync"
	"time"

	"golang.org/x/term"

	wrapping "github.com/hashicorp/go-kms-wrapping"
	"github.com/hashicorp/vault/helper/constants"

	"github.com/docker/docker/pkg/ioutils"
	"github.com/hashicorp/consul/api"
	log "github.com/hashicorp/go-hclog"
	uuid "github.com/hashicorp/go-uuid"
	cserver "github.com/hashicorp/vault/command/server"
	"github.com/hashicorp/vault/helper/metricsutil"
	"github.com/hashicorp/vault/internalshared/configutil"
	"github.com/hashicorp/vault/internalshared/listenerutil"
	"github.com/hashicorp/vault/internalshared/reloadutil"
	physconsul "github.com/hashicorp/vault/physical/consul"
	"github.com/hashicorp/vault/physical/raft"
	"github.com/hashicorp/vault/sdk/physical"
	"github.com/hashicorp/vault/sdk/version"
	sr "github.com/hashicorp/vault/serviceregistration"
	srconsul "github.com/hashicorp/vault/serviceregistration/consul"
	"github.com/hashicorp/vault/vault"
	"github.com/hashicorp/vault/vault/diagnose"
	"github.com/mitchellh/cli"
	"github.com/posener/complete"
)

const OperatorDiagnoseEnableEnv = "VAULT_DIAGNOSE"

const CoreConfigUninitializedErr = "Diagnose cannot attempt this step because core config could not be set."

var (
	_ cli.Command             = (*OperatorDiagnoseCommand)(nil)
	_ cli.CommandAutocomplete = (*OperatorDiagnoseCommand)(nil)
)

type OperatorDiagnoseCommand struct {
	*BaseCommand
	diagnose *diagnose.Session

	flagDebug    bool
	flagSkips    []string
	flagConfigs  []string
	cleanupGuard sync.Once

	reloadFuncsLock      *sync.RWMutex
	reloadFuncs          *map[string][]reloadutil.ReloadFunc
	ServiceRegistrations map[string]sr.Factory
	startedCh            chan struct{} // for tests
	reloadedCh           chan struct{} // for tests
	skipEndEnd           bool          // for tests
}

func (c *OperatorDiagnoseCommand) Synopsis() string {
	return "Troubleshoot problems starting Vault"
}

func (c *OperatorDiagnoseCommand) Help() string {
	helpText := `
Usage: vault operator diagnose

  This command troubleshoots Vault startup issues, such as TLS configuration or
  auto-unseal. It should be run using the same environment variables and configuration
  files as the "vault server" command, so that startup problems can be accurately
  reproduced.

  Start diagnose with a configuration file:

     $ vault operator diagnose -config=/etc/vault/config.hcl

  Perform a diagnostic check while Vault is still running:

     $ vault operator diagnose -config=/etc/vault/config.hcl -skip=listener

` + c.Flags().Help()
	return strings.TrimSpace(helpText)
}

func (c *OperatorDiagnoseCommand) Flags() *FlagSets {
	set := NewFlagSets(c.UI)
	f := set.NewFlagSet("Command Options")

	f.StringSliceVar(&StringSliceVar{
		Name:   "config",
		Target: &c.flagConfigs,
		Completion: complete.PredictOr(
			complete.PredictFiles("*.hcl"),
			complete.PredictFiles("*.json"),
			complete.PredictDirs("*"),
		),
		Usage: "Path to a Vault configuration file or directory of configuration " +
			"files. This flag can be specified multiple times to load multiple " +
			"configurations. If the path is a directory, all files which end in " +
			".hcl or .json are loaded.",
	})

	f.StringSliceVar(&StringSliceVar{
		Name:   "skip",
		Target: &c.flagSkips,
		Usage:  "Skip the health checks named as arguments. May be 'listener', 'storage', or 'autounseal'.",
	})

	f.BoolVar(&BoolVar{
		Name:    "debug",
		Target:  &c.flagDebug,
		Default: false,
		Usage:   "Dump all information collected by Diagnose.",
	})

	f.StringVar(&StringVar{
		Name:   "format",
		Target: &c.flagFormat,
		Usage:  "The output format",
	})
	return set
}

func (c *OperatorDiagnoseCommand) AutocompleteArgs() complete.Predictor {
	return complete.PredictNothing
}

func (c *OperatorDiagnoseCommand) AutocompleteFlags() complete.Flags {
	return c.Flags().Completions()
}

const (
	status_unknown = "[      ] "
	status_ok      = "\u001b[32m[  ok  ]\u001b[0m "
	status_failed  = "\u001b[31m[failed]\u001b[0m "
	status_warn    = "\u001b[33m[ warn ]\u001b[0m "
	same_line      = "\u001b[F"
)

func (c *OperatorDiagnoseCommand) Run(args []string) int {
	f := c.Flags()
	if err := f.Parse(args); err != nil {
		c.UI.Error(err.Error())
		return 3
	}
	return c.RunWithParsedFlags()
}

func (c *OperatorDiagnoseCommand) RunWithParsedFlags() int {

	if len(c.flagConfigs) == 0 {
		c.UI.Error("Must specify a configuration file using -config.")
		return 3
	}

	if c.diagnose == nil {
		if c.flagFormat == "json" {
			c.diagnose = diagnose.New(&ioutils.NopWriter{})
		} else {
			c.UI.Output(version.GetVersion().FullVersionNumber(true))
			c.diagnose = diagnose.New(os.Stdout)
		}
	}
	ctx := diagnose.Context(context.Background(), c.diagnose)
	c.diagnose.SkipFilters = c.flagSkips
	err := c.offlineDiagnostics(ctx)

	results := c.diagnose.Finalize(ctx)
	if c.flagFormat == "json" {
		resultsJS, err := json.MarshalIndent(results, "", "  ")
		if err != nil {
			fmt.Fprintf(os.Stderr, "Error marshalling results: %v.", err)
			return 4
		}
		c.UI.Output(string(resultsJS))
	} else {
		c.UI.Output("\nResults:")
		w, _, err := term.GetSize(0)
		if err == nil {
			results.Write(os.Stdout, w)
		} else {
			results.Write(os.Stdout, 0)
		}
	}

	if err != nil {
		return 4
	}
	// Use a different return code
	switch results.Status {
	case diagnose.WarningStatus:
		return 2
	case diagnose.ErrorStatus:
		return 1
	}
	return 0
}

func (c *OperatorDiagnoseCommand) offlineDiagnostics(ctx context.Context) error {
	rloadFuncs := make(map[string][]reloadutil.ReloadFunc)
	server := &ServerCommand{
		// TODO: set up a different one?
		// In particular, a UI instance that won't output?
		BaseCommand: c.BaseCommand,

		// TODO: refactor to a common place?
		AuditBackends:        auditBackends,
		CredentialBackends:   credentialBackends,
		LogicalBackends:      logicalBackends,
		PhysicalBackends:     physicalBackends,
		ServiceRegistrations: serviceRegistrations,

		// TODO: other ServerCommand options?

		logger: log.NewInterceptLogger(&log.LoggerOptions{
			Level: log.Off,
		}),
		allLoggers:      []log.Logger{},
		reloadFuncs:     &rloadFuncs,
		reloadFuncsLock: new(sync.RWMutex),
	}

	ctx, span := diagnose.StartSpan(ctx, "Vault Diagnose")
	defer span.End()

	// OS Specific checks
	diagnose.OSChecks(ctx)

	var config *cserver.Config

	diagnose.Test(ctx, "Parse Configuration", func(ctx context.Context) (err error) {
		server.flagConfigs = c.flagConfigs
		var configErrors []configutil.ConfigError
		config, configErrors, err = server.parseConfig()
		if err != nil {
			return fmt.Errorf("Could not parse configuration: %w.", err)
		}
		for _, ce := range configErrors {
			diagnose.Warn(ctx, diagnose.CapitalizeFirstLetter(ce.String())+".")
		}
		diagnose.Success(ctx, "Vault configuration syntax is ok.")
		return nil
	})
	if config == nil {
		return fmt.Errorf("No vault server configuration found.")
	}

	var metricSink *metricsutil.ClusterMetricSink
	var metricsHelper *metricsutil.MetricsHelper

	var backend *physical.Backend
	diagnose.Test(ctx, "Check Storage", func(ctx context.Context) error {

		// Ensure that there is a storage stanza
		if config.Storage == nil {
			diagnose.Advise(ctx, "To learn how to specify a storage backend, see the Vault server configuration documentation.")
			return fmt.Errorf("No storage stanza in Vault server configuration.")
		}

		diagnose.Test(ctx, "Create Storage Backend", func(ctx context.Context) error {
			b, err := server.setupStorage(config)
			if err != nil {
				return err
			}
			if b == nil {
				diagnose.Advise(ctx, "To learn how to specify a storage backend, see the Vault server configuration documentation.")
				return fmt.Errorf("Storage backend could not be initialized.")
			}
			backend = &b
			return nil
		})

		if backend == nil {
			diagnose.Fail(ctx, "Diagnose could not initialize storage backend.")
			span.End()
			return fmt.Errorf("Diagnose could not initialize storage backend.")
		}

		// Check for raft quorum status
		if config.Storage.Type == storageTypeRaft {
			path := os.Getenv(raft.EnvVaultRaftPath)
			if path == "" {
				path, ok := config.Storage.Config["path"]
				if !ok {
					diagnose.SpotError(ctx, "Check Raft Folder Permissions", fmt.Errorf("Storage folder path is required."))
				}
				diagnose.RaftFileChecks(ctx, path)
			}
			diagnose.RaftStorageQuorum(ctx, (*backend).(*raft.RaftBackend))
		}

		// Consul storage checks
		if config.Storage != nil && config.Storage.Type == storageTypeConsul {
			diagnose.Test(ctx, "Check Consul TLS", func(ctx context.Context) error {
				err := physconsul.SetupSecureTLS(ctx, api.DefaultConfig(), config.Storage.Config, server.logger, true)
				if err != nil {
					return err
				}
				return nil
			})

			diagnose.Test(ctx, "Check Consul Direct Storage Access", func(ctx context.Context) error {
				dirAccess := diagnose.ConsulDirectAccess(config.Storage.Config)
				if dirAccess != "" {
					diagnose.Warn(ctx, dirAccess)
				}
				if dirAccess == diagnose.DirAccessErr {
					diagnose.Advise(ctx, diagnose.DirAccessAdvice)
				}
				return nil
			})
		}

		// Attempt to use storage backend
		if !c.skipEndEnd && config.Storage.Type != storageTypeRaft {
			diagnose.Test(ctx, "Check Storage Access", diagnose.WithTimeout(30*time.Second, func(ctx context.Context) error {
				maxDurationCrudOperation := "write"
				maxDuration := time.Duration(0)
				uuidSuffix, err := uuid.GenerateUUID()
				if err != nil {
					return err
				}
				uuid := "diagnose/latency/" + uuidSuffix
				dur, err := diagnose.EndToEndLatencyCheckWrite(ctx, uuid, *backend)
				if err != nil {
					return err
				}
				maxDuration = dur
				dur, err = diagnose.EndToEndLatencyCheckRead(ctx, uuid, *backend)
				if err != nil {
					return err
				}
				if dur > maxDuration {
					maxDuration = dur
					maxDurationCrudOperation = "read"
				}
				dur, err = diagnose.EndToEndLatencyCheckDelete(ctx, uuid, *backend)
				if err != nil {
					return err
				}
				if dur > maxDuration {
					maxDuration = dur
					maxDurationCrudOperation = "delete"
				}

				if maxDuration > time.Duration(0) {
					diagnose.Warn(ctx, diagnose.LatencyWarning+fmt.Sprintf("duration: %s, operation: %s", maxDuration, maxDurationCrudOperation))
				}
				return nil
			}))
		}
		return nil
	})

	// Return from top-level span when backend is nil
	if backend == nil {
		return fmt.Errorf("Diagnose could not initialize storage backend.")
	}

	var configSR sr.ServiceRegistration
	diagnose.Test(ctx, "Check Service Discovery", func(ctx context.Context) error {
		if config.ServiceRegistration == nil || config.ServiceRegistration.Config == nil {
			diagnose.Skipped(ctx, "No service registration configured.")
			return nil
		}
		srConfig := config.ServiceRegistration.Config

		diagnose.Test(ctx, "Check Consul Service Discovery TLS", func(ctx context.Context) error {
			// SetupSecureTLS for service discovery uses the same cert and key to set up physical
			// storage. See the consul package in physical for details.
			err := srconsul.SetupSecureTLS(ctx, api.DefaultConfig(), srConfig, server.logger, true)
			if err != nil {
				return err
			}
			return nil
		})

		if config.ServiceRegistration != nil && config.ServiceRegistration.Type == "consul" {
			diagnose.Test(ctx, "Check Consul Direct Service Discovery", func(ctx context.Context) error {
				dirAccess := diagnose.ConsulDirectAccess(config.ServiceRegistration.Config)
				if dirAccess != "" {
					diagnose.Warn(ctx, dirAccess)
				}
				if dirAccess == diagnose.DirAccessErr {
					diagnose.Advise(ctx, diagnose.DirAccessAdvice)
				}
				return nil
			})
		}
		return nil
	})

	sealcontext, sealspan := diagnose.StartSpan(ctx, "Create Vault Server Configuration Seals")
	var seals []vault.Seal
	var sealConfigError error

	barrierSeal, barrierWrapper, unwrapSeal, seals, sealConfigError, err := setSeal(server, config, make([]string, 0), make(map[string]string))

	// Check error here
	if err != nil {
		diagnose.Advise(ctx, "For assistance with the seal stanza, see the Vault configuration documentation.")
		diagnose.Fail(sealcontext, fmt.Sprintf("Seal creation resulted in the following error: %s.", err.Error()))
		goto SEALFAIL
	}
	if sealConfigError != nil {
		diagnose.Fail(sealcontext, "Seal could not be configured: seals may already be initialized.")
		goto SEALFAIL
	}

	if seals != nil {
		for _, seal := range seals {
			// There is always one nil seal. We need to skip it so we don't start an empty Finalize-Seal-Shamir
			// section.
			if seal == nil {
				continue
			}
			// Ensure that the seal finalizer is called, even if using verify-only
			defer func(seal *vault.Seal) {
				sealType := diagnose.CapitalizeFirstLetter((*seal).BarrierType())
				finalizeSealContext, finalizeSealSpan := diagnose.StartSpan(ctx, "Finalize "+sealType+" Seal")
				err = (*seal).Finalize(finalizeSealContext)
				if err != nil {
					diagnose.Fail(finalizeSealContext, "Error finalizing seal.")
					diagnose.Advise(finalizeSealContext, "This likely means that the barrier is still in use; therefore, finalizing the seal timed out.")
					finalizeSealSpan.End()
				}
				finalizeSealSpan.End()
			}(&seal)
		}
	}

	if barrierSeal == nil {
		diagnose.Fail(sealcontext, "Could not create barrier seal. No error was generated, but it is likely that the seal stanza is misconfigured. For guidance, see Vault's configuration documentation on the seal stanza.")
	}

SEALFAIL:
	sealspan.End()

	diagnose.Test(ctx, "Check Transit Seal TLS", func(ctx context.Context) error {
		var checkSealTransit bool
		for _, seal := range config.Seals {
			if seal.Type == "transit" {
				checkSealTransit = true

				tlsSkipVerify, _ := seal.Config["tls_skip_verify"]
				if tlsSkipVerify == "true" {
					diagnose.Warn(ctx, "TLS verification is skipped. This is highly discouraged and decreases the security of data transmissions to and from the Vault server.")
					return nil
				}

				// Checking tls_client_cert and tls_client_key
				tlsClientCert, ok := seal.Config["tls_client_cert"]
				if !ok {
					diagnose.Warn(ctx, "Missing tls_client_cert in the seal configuration.")
					return nil
				}
				tlsClientKey, ok := seal.Config["tls_client_key"]
				if !ok {
					diagnose.Warn(ctx, "Missing tls_client_key in the seal configuration.")
					return nil
				}
				_, err := diagnose.TLSFileChecks(tlsClientCert, tlsClientKey)
				if err != nil {
					return fmt.Errorf("The TLS certificate and key configured through the tls_client_cert and tls_client_key fields of the transit seal configuration are invalid: %w.", err)
				}

				// checking tls_ca_cert
				tlsCACert, ok := seal.Config["tls_ca_cert"]
				if !ok {
					diagnose.Warn(ctx, "Missing tls_ca_cert in the seal configuration.")
					return nil
				}
				warnings, err := diagnose.TLSCAFileCheck(tlsCACert)
				if len(warnings) != 0 {
					for _, warning := range warnings {
						diagnose.Warn(ctx, warning)
					}
				}
				if err != nil {
					return fmt.Errorf("The TLS CA certificate configured through the tls_ca_cert field of the transit seal configuration is invalid: %w.", err)
				}
			}
		}
		if !checkSealTransit {
			diagnose.Skipped(ctx, "No transit seal found in seal configuration.")
		}
		return nil
	})

	var coreConfig vault.CoreConfig
	diagnose.Test(ctx, "Create Core Configuration", func(ctx context.Context) error {
		var secureRandomReader io.Reader
		// prepare a secure random reader for core
		randReaderTestName := "Initialize Randomness for Core"
		secureRandomReader, err = configutil.CreateSecureRandomReaderFunc(config.SharedConfig, barrierWrapper)
		if err != nil {
			return diagnose.SpotError(ctx, randReaderTestName, fmt.Errorf("Could not initialize randomness for core: %w.", err))
		}
		diagnose.SpotOk(ctx, randReaderTestName, "")
		coreConfig = createCoreConfig(server, config, *backend, configSR, barrierSeal, unwrapSeal, metricsHelper, metricSink, secureRandomReader)
		return nil
	})

	var disableClustering bool
	diagnose.Test(ctx, "HA Storage", func(ctx context.Context) error {
		diagnose.Test(ctx, "Create HA Storage Backend", func(ctx context.Context) error {
			// Initialize the separate HA storage backend, if it exists
			disableClustering, err = initHaBackend(server, config, &coreConfig, *backend)
			if err != nil {
				return err
			}
			return nil
		})

		diagnose.Test(ctx, "Check HA Consul Direct Storage Access", func(ctx context.Context) error {
			if config.HAStorage == nil {
				diagnose.Skipped(ctx, "No HA storage stanza is configured.")
			} else {
				dirAccess := diagnose.ConsulDirectAccess(config.HAStorage.Config)
				if dirAccess != "" {
					diagnose.Warn(ctx, dirAccess)
				}
				if dirAccess == diagnose.DirAccessErr {
					diagnose.Advise(ctx, diagnose.DirAccessAdvice)
				}
			}
			return nil
		})
		if config.HAStorage != nil && config.HAStorage.Type == storageTypeConsul {
			diagnose.Test(ctx, "Check Consul TLS", func(ctx context.Context) error {
				err = physconsul.SetupSecureTLS(ctx, api.DefaultConfig(), config.HAStorage.Config, server.logger, true)
				if err != nil {
					return err
				}
				return nil
			})
		}
		return nil
	})

	// Determine the redirect address from environment variables
	err = determineRedirectAddr(server, &coreConfig, config)
	if err != nil {
		return diagnose.SpotError(ctx, "Determine Redirect Address", fmt.Errorf("Redirect Address could not be determined: %w.", err))
	}
	diagnose.SpotOk(ctx, "Determine Redirect Address", "")

	err = findClusterAddress(server, &coreConfig, config, disableClustering)
	if err != nil {
		diagnose.Advise(ctx, "Please check that the API and Cluster addresses are different, and that the API, Cluster and Redirect addresses have both a host and port.")
		return diagnose.SpotError(ctx, "Check Cluster Address", fmt.Errorf("Cluster Address could not be determined or was invalid: %w.", err))
	}
	diagnose.SpotOk(ctx, "Check Cluster Address", "Cluster address is logically valid and can be found.")

	var vaultCore *vault.Core

	// Run all the checks that are utilized when initializing a core object
	// without actually calling core.Init. These are in the init-core section
	// as they are runtime checks.
	diagnose.Test(ctx, "Check Core Creation", func(ctx context.Context) error {
		var newCoreError error
		if coreConfig.RawConfig == nil {
			return fmt.Errorf(CoreConfigUninitializedErr)
		}
		core, newCoreError := vault.CreateCore(&coreConfig)
		if newCoreError != nil {
			if vault.IsFatalError(newCoreError) {
				return fmt.Errorf("Error initializing core: %s.", newCoreError)
			}
			diagnose.Warn(ctx, wrapAtLength(
				"A non-fatal error occurred during initialization. Please check the logs for more information."))
		} else {
			vaultCore = core
		}
		return nil
	})

	if vaultCore == nil {
		return fmt.Errorf("Diagnose could not initialize the Vault core from the Vault server configuration.")
	}

	licenseCtx, licenseSpan := diagnose.StartSpan(ctx, "Check For Autoloaded License")
	// If we are not in enterprise, return from the check
	if !constants.IsEnterprise {
		diagnose.Skipped(licenseCtx, "License check will not run on OSS Vault.")
	} else {
		// Load License from environment variables. These take precedence over the
		// configured license.
		if envLicensePath := os.Getenv(EnvVaultLicensePath); envLicensePath != "" {
			coreConfig.LicensePath = envLicensePath
		}
		if envLicense := os.Getenv(EnvVaultLicense); envLicense != "" {
			coreConfig.License = envLicense
		}
		vault.DiagnoseCheckLicense(licenseCtx, vaultCore, coreConfig)
	}
	licenseSpan.End()

	var lns []listenerutil.Listener
	diagnose.Test(ctx, "Start Listeners", func(ctx context.Context) error {
		disableClustering := config.HAStorage != nil && config.HAStorage.DisableClustering
		infoKeys := make([]string, 0, 10)
		info := make(map[string]string)
		var listeners []listenerutil.Listener
		var status int

		diagnose.ListenerChecks(ctx, config.Listeners)

		diagnose.Test(ctx, "Create Listeners", func(ctx context.Context) error {
			status, listeners, _, err = server.InitListeners(config, disableClustering, &infoKeys, &info)
			if status != 0 {
				return err
			}
			return nil
		})

		lns = listeners

		// Make sure we close all listeners from this point on
		listenerCloseFunc := func() {
			for _, ln := range lns {
				ln.Listener.Close()
			}
		}

		c.cleanupGuard.Do(listenerCloseFunc)

		return nil
	})

	// TODO: Diagnose logging configuration

	// The unseal diagnose check will simply attempt to use the barrier to encrypt and
	// decrypt a mock value. It will not call runUnseal.
	diagnose.Test(ctx, "Check Autounseal Encryption", diagnose.WithTimeout(30*time.Second, func(ctx context.Context) error {
		if barrierSeal == nil {
			return fmt.Errorf("Diagnose could not create a barrier seal object.")
		}
		if barrierSeal.BarrierType() == wrapping.Shamir {
			diagnose.Skipped(ctx, "Skipping barrier encryption test. Only supported for auto-unseal.")
			return nil
		}
		barrierUUID, err := uuid.GenerateUUID()
		if err != nil {
			return fmt.Errorf("Diagnose could not create unique UUID for unsealing.")
		}
		barrierEncValue := "diagnose-" + barrierUUID
		ciphertext, err := barrierWrapper.Encrypt(ctx, []byte(barrierEncValue), nil)
		if err != nil {
			return fmt.Errorf("Error encrypting with seal barrier: %w.", err)
		}
		plaintext, err := barrierWrapper.Decrypt(ctx, ciphertext, nil)
		if err != nil {
			return fmt.Errorf("Error decrypting with seal barrier: %w", err)

		}
		if string(plaintext) != barrierEncValue {
			return fmt.Errorf("Barrier returned incorrect decrypted value for mock data.")
		}
		return nil
	}))

	// The following block contains static checks that are run during the
	// startHttpServers portion of server run. In other words, they are static
	// checks during resource creation. Currently there is nothing important in this
	// diagnose check. For now it is a placeholder for any checks that will be done
	// before server run.
	diagnose.Test(ctx, "Check Server Before Runtime", func(ctx context.Context) error {
		for _, ln := range lns {
			if ln.Config == nil {
				return fmt.Errorf("Found no listener config after parsing the Vault configuration.")
			}
		}
		return nil
	})
	return nil
}