open-consul/command/agent.go
Frank Schröder 69a088ca85 New config parser, HCL support, multiple bind addrs (#3480)
* new config parser for agent

This patch implements a new config parser for the consul agent which
makes the following changes to the previous implementation:

 * add HCL support
 * all configuration fragments in tests and for default config are
   expressed as HCL fragments
 * HCL fragments can be provided on the command line so that they
   can eventually replace the command line flags.
 * HCL/JSON fragments are parsed into a temporary Config structure
   which can be merged using reflection (all values are pointers).
   The existing merge logic of overwrite for values and append
   for slices has been preserved.
 * A single builder process generates a typed runtime configuration
   for the agent.

The new implementation is more strict and fails in the builder process
if no valid runtime configuration can be generated. Therefore,
additional validations in other parts of the code should be removed.

The builder also pre-computes all required network addresses so that no
address/port magic should be required where the configuration is used
and should therefore be removed.

* Upgrade github.com/hashicorp/hcl to support int64

* improve error messages

* fix directory permission test

* Fix rtt test

* Fix ForceLeave test

* Skip performance test for now until we know what to do

* Update github.com/hashicorp/memberlist to update log prefix

* Make memberlist use the default logger

* improve config error handling

* do not fail on non-existing data-dir

* experiment with non-uniform timeouts to get a handle on stalled leader elections

* Run tests for packages separately to eliminate the spurious port conflicts

* refactor private address detection and unify approach for ipv4 and ipv6.

Fixes #2825

* do not allow unix sockets for DNS

* improve bind and advertise addr error handling

* go through builder using test coverage

* minimal update to the docs

* more coverage tests fixed

* more tests

* fix makefile

* cleanup

* fix port conflicts with external port server 'porter'

* stop test server on error

* do not run api test that change global ENV concurrently with the other tests

* Run remaining api tests concurrently

* no need for retry with the port number service

* monkey patch race condition in go-sockaddr until we understand why that fails

* monkey patch hcl decoder race condition until we understand why that fails

* monkey patch spurious errors in strings.EqualFold from here

* add test for hcl decoder race condition. Run with go test -parallel 128

* Increase timeout again

* cleanup

* don't log port allocations by default

* use base command arg parsing to format help output properly

* handle -dc deprecation case in Build

* switch autopilot.max_trailing_logs to int

* remove duplicate test case

* remove unused methods

* remove comments about flag/config value inconsistencies

* switch got and want around since the error message was misleading.

* Removes a stray debug log.

* Removes a stray newline in imports.

* Fixes TestACL_Version8.

* Runs go fmt.

* Adds a default case for unknown address types.

* Reorders and reformats some imports.

* Adds some comments and fixes typos.

* Reorders imports.

* add unix socket support for dns later

* drop all deprecated flags and arguments

* fix wrong field name

* remove stray node-id file

* drop unnecessary patch section in test

* drop duplicate test

* add test for LeaveOnTerm and SkipLeaveOnInt in client mode

* drop "bla" and add clarifying comment for the test

* split up tests to support enterprise/non-enterprise tests

* drop raft multiplier and derive values during build phase

* sanitize runtime config reflectively and add test

* detect invalid config fields

* fix tests with invalid config fields

* use different values for wan sanitization test

* drop recursor in favor of recursors

* allow dns_config.udp_answer_limit to be zero

* make sure tests run on machines with multiple ips

* Fix failing tests in a few more places by providing a bind address in the test

* Gets rid of skipped TestAgent_CheckPerformanceSettings and adds case for builder.

* Add porter to server_test.go to make tests there less flaky

* go fmt
2017-09-25 11:40:42 -07:00

499 lines
14 KiB
Go

package command
import (
"encoding/json"
"fmt"
"io"
"log"
"os"
"os/signal"
"path/filepath"
"regexp"
"strings"
"syscall"
"time"
"github.com/armon/go-metrics"
"github.com/armon/go-metrics/circonus"
"github.com/armon/go-metrics/datadog"
"github.com/hashicorp/consul/agent"
"github.com/hashicorp/consul/agent/config"
"github.com/hashicorp/consul/lib"
"github.com/hashicorp/consul/logger"
"github.com/hashicorp/go-checkpoint"
multierror "github.com/hashicorp/go-multierror"
"github.com/hashicorp/logutils"
"github.com/mitchellh/cli"
)
// validDatacenter matches datacenter names made up solely of ASCII letters,
// digits, underscores and dashes (at least one character).
var validDatacenter = regexp.MustCompile(`^[a-zA-Z0-9_-]+$`)
// AgentCommand is the Command implementation that runs a Consul agent.
// Once started it keeps running until a signal arrives, a message is
// received on ShutdownCh, or the agent shuts down on its own.
type AgentCommand struct {
	BaseCommand

	// Version information supplied by whoever constructs the command.
	// Version is quoted in update-check messages and HumanVersion is
	// printed in the startup banner.
	Revision          string
	Version           string
	VersionPrerelease string
	HumanVersion      string

	// ShutdownCh requests a shutdown; a message received on it is handled
	// the same way as an interrupt signal.
	ShutdownCh <-chan struct{}

	// args holds the raw command line arguments so that the configuration
	// can be re-read on reload.
	args []string

	// Logging state created while the command starts up.
	logFilter *logutils.LevelFilter
	logOutput io.Writer
	logger    *log.Logger
}
// readConfig parses the flags stored in cmd.args and runs the config
// builder to produce a validated runtime configuration. Every failure is
// reported on the UI and signalled to the caller with a nil result; builder
// warnings are printed only once a valid configuration has been built.
func (cmd *AgentCommand) readConfig() *config.RuntimeConfig {
	var flags config.Flags
	config.AddFlags(cmd.BaseCommand.NewFlagSet(cmd), &flags)

	if parseErr := cmd.BaseCommand.Parse(cmd.args); parseErr != nil {
		cmd.UI.Error(fmt.Sprintf("error parsing flags: %v", parseErr))
		return nil
	}

	builder, err := config.NewBuilder(flags)
	if err != nil {
		cmd.UI.Error(err.Error())
		return nil
	}

	runtimeCfg, err := builder.BuildAndValidate()
	if err != nil {
		cmd.UI.Error(err.Error())
		return nil
	}

	for _, warning := range builder.Warnings {
		cmd.UI.Warn(warning)
	}
	return &runtimeCfg
}
// checkpointResults handles a result delivered by the update checker. A
// failed check and a newer available version are both reported as errors on
// the UI. Every alert is printed as a bulletin: through Info when its level
// is "info" and through Error for any other level.
func (cmd *AgentCommand) checkpointResults(results *checkpoint.CheckResponse, err error) {
	if err != nil {
		cmd.UI.Error(fmt.Sprintf("Failed to check for updates: %v", err))
		return
	}

	if results.Outdated {
		cmd.UI.Error(fmt.Sprintf("Newer Consul version available: %s (currently running: %s)", results.CurrentVersion, cmd.Version))
	}

	for _, alert := range results.Alerts {
		bulletin := fmt.Sprintf("Bulletin [%s]: %s (%s)", alert.Level, alert.Message, alert.URL)
		if alert.Level == "info" {
			cmd.UI.Info(bulletin)
		} else {
			cmd.UI.Error(bulletin)
		}
	}
}
// startupUpdateCheck schedules the version update checks: a periodic check
// every 24 hours plus one initial check that runs in the background after a
// random delay of up to 30 seconds. Results go to cmd.checkpointResults.
func (cmd *AgentCommand) startupUpdateCheck(config *config.RuntimeConfig) {
	params := &checkpoint.CheckParams{
		Product: "consul",
		Version: config.Version,
	}
	if pre := config.VersionPrerelease; pre != "" {
		params.Version += fmt.Sprintf("-%s", pre)
	}
	// The signature file is only configured when anonymous signatures
	// have not been disabled.
	if !config.DisableAnonymousSignature {
		params.SignatureFile = filepath.Join(config.DataDir, "checkpoint-signature")
	}

	// Periodic check with an expected interval of 24 hours.
	checkpoint.CheckInterval(params, 24*time.Hour, cmd.checkpointResults)

	// Immediate check some time within the next 30 seconds.
	go func() {
		delay := lib.RandomStagger(30 * time.Second)
		time.Sleep(delay)
		response, err := checkpoint.Check(params)
		cmd.checkpointResults(response, err)
	}()
}
// startupJoin joins the LAN addresses configured for start time, if any.
// It is a no-op when no addresses are configured and returns the join error
// otherwise.
func (cmd *AgentCommand) startupJoin(agent *agent.Agent, cfg *config.RuntimeConfig) error {
	addrs := cfg.StartJoinAddrsLAN
	if len(addrs) == 0 {
		return nil
	}

	cmd.UI.Output("Joining cluster...")
	joined, err := agent.JoinLAN(addrs)
	if err == nil {
		cmd.UI.Info(fmt.Sprintf("Join completed. Synced with %d initial agents", joined))
	}
	return err
}
// startupJoinWan joins the WAN addresses configured for start time, if any.
// It is a no-op when no addresses are configured and returns the join error
// otherwise.
func (cmd *AgentCommand) startupJoinWan(agent *agent.Agent, cfg *config.RuntimeConfig) error {
	addrs := cfg.StartJoinAddrsWAN
	if len(addrs) == 0 {
		return nil
	}

	cmd.UI.Output("Joining -wan cluster...")
	joined, err := agent.JoinWAN(addrs)
	if err == nil {
		cmd.UI.Info(fmt.Sprintf("Join -wan completed. Synced with %d initial agents", joined))
	}
	return err
}
// statsiteSink creates a statsite metrics sink for the configured address.
// It returns (nil, nil) when no statsite address is configured. The hostname
// argument is unused and only present to match the other sink constructors.
func statsiteSink(config *config.RuntimeConfig, hostname string) (metrics.MetricSink, error) {
	if addr := config.TelemetryStatsiteAddr; addr != "" {
		return metrics.NewStatsiteSink(addr)
	}
	return nil, nil
}
// statsdSink creates a statsd metrics sink for the configured address.
// It returns (nil, nil) when no statsd address is configured. The hostname
// argument is unused and only present to match the other sink constructors.
func statsdSink(config *config.RuntimeConfig, hostname string) (metrics.MetricSink, error) {
	if addr := config.TelemetryStatsdAddr; addr != "" {
		return metrics.NewStatsdSink(addr)
	}
	return nil, nil
}
// dogstatdSink creates a DogStatsD metrics sink for the configured address
// and hostname and applies the configured tags to it. It returns (nil, nil)
// when no DogStatsD address is configured.
func dogstatdSink(config *config.RuntimeConfig, hostname string) (metrics.MetricSink, error) {
	addr := config.TelemetryDogstatsdAddr
	if addr == "" {
		return nil, nil
	}

	dog, err := datadog.NewDogStatsdSink(addr, hostname)
	if err != nil {
		return nil, err
	}

	dog.SetTags(config.TelemetryDogstatsdTags)
	return dog, nil
}
// circonusSink creates and starts a Circonus metrics sink from the telemetry
// settings. It returns (nil, nil) when neither an API token nor a submission
// URL is configured. Empty values for the check display name, the API token
// app and the check search tag are replaced by "Consul", "consul" and
// "service:consul" respectively. The hostname argument is unused.
func circonusSink(config *config.RuntimeConfig, hostname string) (metrics.MetricSink, error) {
	if config.TelemetryCirconusAPIToken == "" && config.TelemetryCirconusSubmissionURL == "" {
		return nil, nil
	}

	sinkCfg := &circonus.Config{}
	sinkCfg.Interval = config.TelemetryCirconusSubmissionInterval

	// API access settings.
	sinkCfg.CheckManager.API.TokenKey = config.TelemetryCirconusAPIToken
	sinkCfg.CheckManager.API.URL = config.TelemetryCirconusAPIURL
	sinkCfg.CheckManager.API.TokenApp = config.TelemetryCirconusAPIApp
	if sinkCfg.CheckManager.API.TokenApp == "" {
		sinkCfg.CheckManager.API.TokenApp = "consul"
	}

	// Check settings.
	sinkCfg.CheckManager.Check.SubmissionURL = config.TelemetryCirconusSubmissionURL
	sinkCfg.CheckManager.Check.ID = config.TelemetryCirconusCheckID
	sinkCfg.CheckManager.Check.ForceMetricActivation = config.TelemetryCirconusCheckForceMetricActivation
	sinkCfg.CheckManager.Check.InstanceID = config.TelemetryCirconusCheckInstanceID
	sinkCfg.CheckManager.Check.Tags = config.TelemetryCirconusCheckTags
	sinkCfg.CheckManager.Check.SearchTag = config.TelemetryCirconusCheckSearchTag
	if sinkCfg.CheckManager.Check.SearchTag == "" {
		sinkCfg.CheckManager.Check.SearchTag = "service:consul"
	}
	sinkCfg.CheckManager.Check.DisplayName = config.TelemetryCirconusCheckDisplayName
	if sinkCfg.CheckManager.Check.DisplayName == "" {
		sinkCfg.CheckManager.Check.DisplayName = "Consul"
	}

	// Broker settings.
	sinkCfg.CheckManager.Broker.ID = config.TelemetryCirconusBrokerID
	sinkCfg.CheckManager.Broker.SelectTag = config.TelemetryCirconusBrokerSelectTag

	circ, err := circonus.NewCirconusSink(sinkCfg)
	if err != nil {
		return nil, err
	}
	circ.Start()
	return circ, nil
}
// startupTelemetry installs the global metrics configuration and returns the
// in-memory sink that is always part of it. The statsite, statsd, DogStatsD
// and Circonus sinks are tried in that order; the first constructor error
// aborts the setup and is returned. When at least one of them is configured,
// metrics fan out to all configured sinks plus the in-memory sink. Otherwise
// only the in-memory sink is used and the hostname prefix is disabled.
func startupTelemetry(conf *config.RuntimeConfig) (*metrics.InmemSink, error) {
	// Aggregate on 10 second intervals for 1 minute. Expose the
	// metrics over stderr when there is a SIGUSR1 received.
	inmem := metrics.NewInmemSink(10*time.Second, time.Minute)
	metrics.DefaultInmemSignal(inmem)

	metricsConf := metrics.DefaultConfig(conf.TelemetryStatsitePrefix)
	metricsConf.EnableHostname = !conf.TelemetryDisableHostname
	metricsConf.FilterDefault = conf.TelemetryFilterDefault

	// Constructors for the optional sinks; each one returns a nil sink
	// when its backend is not configured.
	factories := []func(*config.RuntimeConfig, string) (metrics.MetricSink, error){
		statsiteSink,
		statsdSink,
		dogstatdSink,
		circonusSink,
	}

	var fanout metrics.FanoutSink
	for _, newSink := range factories {
		sink, err := newSink(conf, metricsConf.HostName)
		if err != nil {
			return nil, err
		}
		if sink != nil {
			fanout = append(fanout, sink)
		}
	}

	if len(fanout) == 0 {
		metricsConf.EnableHostname = false
		metrics.NewGlobal(metricsConf, inmem)
		return inmem, nil
	}

	fanout = append(fanout, inmem)
	metrics.NewGlobal(metricsConf, fanout)
	return inmem, nil
}
// Run executes the agent command and returns its exit code. The exit code is
// also written to the command's logger when one was set up during the run.
func (cmd *AgentCommand) Run(args []string) int {
	exitCode := cmd.run(args)
	if cmd.logger == nil {
		return exitCode
	}
	cmd.logger.Println("[INFO] Exit code:", exitCode)
	return exitCode
}
// run is the body of the agent command. It reads the configuration, sets up
// logging and telemetry, creates and starts the agent, performs the startup
// joins and then blocks handling signals, reload requests and shutdown
// notifications until the agent exits.
//
// It returns the process exit code: 0 after a completed graceful leave or
// when the agent has already shut down by itself, and 1 for every startup
// failure, a failed retry join, a non-graceful exit, a graceful-leave
// timeout, or a second signal received while leaving.
func (cmd *AgentCommand) run(args []string) int {
	cmd.UI = &cli.PrefixedUi{
		OutputPrefix: "==> ",
		InfoPrefix:   " ",
		ErrorPrefix:  "==> ",
		Ui:           cmd.UI,
	}

	// Parse our configs
	cmd.args = args
	config := cmd.readConfig()
	if config == nil {
		return 1
	}

	// Setup the log outputs
	logConfig := &logger.Config{
		LogLevel:       config.LogLevel,
		EnableSyslog:   config.EnableSyslog,
		SyslogFacility: config.SyslogFacility,
	}
	logFilter, logGate, logWriter, logOutput, ok := logger.Setup(logConfig, cmd.UI)
	if !ok {
		return 1
	}
	cmd.logFilter = logFilter
	cmd.logOutput = logOutput
	cmd.logger = log.New(logOutput, "", log.LstdFlags)

	memSink, err := startupTelemetry(config)
	if err != nil {
		cmd.UI.Error(err.Error())
		return 1
	}

	// Create the agent
	cmd.UI.Output("Starting Consul agent...")
	agent, err := agent.New(config)
	if err != nil {
		cmd.UI.Error(fmt.Sprintf("Error creating agent: %s", err))
		return 1
	}
	agent.LogOutput = logOutput
	agent.LogWriter = logWriter
	agent.MemSink = memSink

	if err := agent.Start(); err != nil {
		cmd.UI.Error(fmt.Sprintf("Error starting agent: %s", err))
		return 1
	}

	// shutdown agent before endpoints (deferred calls run in reverse order)
	defer agent.ShutdownEndpoints()
	defer agent.ShutdownAgent()

	if !config.DisableUpdateCheck {
		cmd.startupUpdateCheck(config)
	}

	if err := cmd.startupJoin(agent, config); err != nil {
		cmd.UI.Error(err.Error())
		return 1
	}

	if err := cmd.startupJoinWan(agent, config); err != nil {
		cmd.UI.Error(err.Error())
		return 1
	}

	// Let the agent know we've finished registration
	agent.StartSync()

	segment := config.SegmentName
	if config.ServerMode {
		segment = "<all>"
	}

	cmd.UI.Output("Consul agent running!")
	cmd.UI.Info(fmt.Sprintf(" Version: '%s'", cmd.HumanVersion))
	cmd.UI.Info(fmt.Sprintf(" Node ID: '%s'", config.NodeID))
	cmd.UI.Info(fmt.Sprintf(" Node name: '%s'", config.NodeName))
	cmd.UI.Info(fmt.Sprintf(" Datacenter: '%s' (Segment: '%s')", config.Datacenter, segment))
	cmd.UI.Info(fmt.Sprintf(" Server: %v (Bootstrap: %v)", config.ServerMode, config.Bootstrap))
	cmd.UI.Info(fmt.Sprintf(" Client Addr: %v (HTTP: %d, HTTPS: %d, DNS: %d)", config.ClientAddrs,
		config.HTTPPort, config.HTTPSPort, config.DNSPort))
	cmd.UI.Info(fmt.Sprintf(" Cluster Addr: %v (LAN: %d, WAN: %d)", config.AdvertiseAddrLAN,
		config.SerfPortLAN, config.SerfPortWAN))
	cmd.UI.Info(fmt.Sprintf(" Encrypt: Gossip: %v, TLS-Outgoing: %v, TLS-Incoming: %v",
		agent.GossipEncrypted(), config.VerifyOutgoing, config.VerifyIncoming))

	// Enable log streaming
	cmd.UI.Info("")
	cmd.UI.Output("Log data will now stream in as it occurs:\n")
	logGate.Flush()

	// wait for signal. A single Notify call registers every signal of
	// interest; SIGPIPE is delivered here and skipped in the loop below.
	signalCh := make(chan os.Signal, 4)
	signal.Notify(signalCh, os.Interrupt, syscall.SIGTERM, syscall.SIGHUP, syscall.SIGPIPE)

	for {
		var sig os.Signal
		var reloadErrCh chan error
		select {
		case s := <-signalCh:
			sig = s
		case ch := <-agent.ReloadCh():
			// A reload requested through the agent is handled like
			// SIGHUP, with the result reported back on ch.
			sig = syscall.SIGHUP
			reloadErrCh = ch
		case <-cmd.ShutdownCh:
			sig = os.Interrupt
		case err := <-agent.RetryJoinCh():
			cmd.logger.Println("[ERR] Retry join failed: ", err)
			return 1
		case <-agent.ShutdownCh():
			// agent is already down!
			return 0
		}

		switch sig {
		case syscall.SIGPIPE:
			continue

		case syscall.SIGHUP:
			cmd.logger.Println("[INFO] Caught signal: ", sig)

			conf, err := cmd.handleReload(agent, config)
			if conf != nil {
				config = conf
			}
			if err != nil {
				cmd.logger.Println("[ERR] Reload config failed: ", err)
			}
			// Send result back if reload was called via HTTP
			if reloadErrCh != nil {
				reloadErrCh <- err
			}

		default:
			cmd.logger.Println("[INFO] Caught signal: ", sig)

			// An interrupt leaves gracefully unless SkipLeaveOnInt is
			// set; SIGTERM leaves gracefully only if LeaveOnTerm is set.
			graceful := (sig == os.Interrupt && !(config.SkipLeaveOnInt)) || (sig == syscall.SIGTERM && (config.LeaveOnTerm))
			if !graceful {
				cmd.logger.Println("[INFO] Graceful shutdown disabled. Exiting")
				return 1
			}

			cmd.logger.Println("[INFO] Gracefully shutting down agent...")
			gracefulCh := make(chan struct{})
			go func() {
				// On a failed leave gracefulCh stays open, so the
				// select below ends on a second signal or the timeout.
				if err := agent.Leave(); err != nil {
					cmd.logger.Println("[ERR] Error on leave:", err)
					return
				}
				close(gracefulCh)
			}()

			gracefulTimeout := 15 * time.Second
			select {
			case second := <-signalCh:
				// Report the signal that was actually received here,
				// not the one that started the graceful shutdown.
				cmd.logger.Printf("[INFO] Caught second signal %v. Exiting\n", second)
				return 1
			case <-time.After(gracefulTimeout):
				cmd.logger.Println("[INFO] Timeout on graceful leave. Exiting")
				return 1
			case <-gracefulCh:
				cmd.logger.Println("[INFO] Graceful exit completed")
				return 0
			}
		}
	}
}
// handleReload is invoked when we should reload our configs, e.g. SIGHUP.
//
// It re-reads the configuration, applies the new log level to the log filter
// and hands the new configuration to the agent. cfg is the configuration
// currently in effect.
//
// The returned configuration is the one the caller should use from now on:
// the new configuration when the agent accepted it, otherwise cfg. The
// returned error accumulates every problem encountered and is nil when the
// reload succeeded without issues. An invalid log level is reported but does
// not abort the reload; the current level is kept in that case.
func (cmd *AgentCommand) handleReload(agent *agent.Agent, cfg *config.RuntimeConfig) (*config.RuntimeConfig, error) {
	cmd.logger.Println("[INFO] Reloading configuration...")
	var errs error
	newCfg := cmd.readConfig()
	if newCfg == nil {
		errs = multierror.Append(errs, fmt.Errorf("Failed to reload configs"))
		return cfg, errs
	}

	// Change the log level
	minLevel := logutils.LogLevel(strings.ToUpper(newCfg.LogLevel))
	if logger.ValidateLevelFilter(minLevel, cmd.logFilter) {
		cmd.logFilter.SetMinLevel(minLevel)
	} else {
		errs = multierror.Append(errs, fmt.Errorf(
			"Invalid log level: %s. Valid log levels are: %v",
			minLevel, cmd.logFilter.Levels))

		// Keep the current log level
		newCfg.LogLevel = cfg.LogLevel
	}

	if err := agent.ReloadConfig(newCfg); err != nil {
		// Pass errs as the accumulator so that an earlier log level
		// error is kept alongside the reload failure.
		errs = multierror.Append(errs, fmt.Errorf(
			"Failed to reload configs: %v", err))
		return cfg, errs
	}

	// The agent accepted the new configuration, so the caller must
	// continue with it rather than with the stale one.
	return newCfg, errs
}
// Synopsis returns the one-line description of the agent command.
func (cmd *AgentCommand) Synopsis() string {
	const synopsis = "Runs a Consul agent"
	return synopsis
}
// Help returns the usage text of the agent command: a short description
// followed by the help output of the embedded BaseCommand, with surrounding
// whitespace trimmed.
func (cmd *AgentCommand) Help() string {
	const usage = `
Usage: consul agent [options]
Starts the Consul agent and runs until an interrupt is received. The
agent represents a single node in a cluster.
`
	return strings.TrimSpace(usage + cmd.BaseCommand.Help())
}
// printJSON writes name on a line of its own to standard output, followed by
// v encoded as indented JSON. When v cannot be encoded as JSON its Go-syntax
// representation is printed instead.
func printJSON(name string, v interface{}) {
	fmt.Println(name)
	if encoded, err := json.MarshalIndent(v, "", " "); err == nil {
		fmt.Println(string(encoded))
	} else {
		fmt.Printf("%#v\n", v)
	}
}