diff --git a/CHANGELOG.md b/CHANGELOG.md index df97d856e..101c70c0b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -134,7 +134,6 @@ FEATURES: * **Support for Running Subproccesses Directly Without a Shell:** Consul agent checks and watches now support an `args` configuration which is a list of arguments to run for the subprocess, which runs the subprocess directly without a shell. The old `script` and `handler` configurations are now deprecated (specify a shell explicitly if you require one). A `-shell=false` option is also available on `consul lock`, `consul watch`, and `consul exec` to run the subprocesses associated with those without a shell. [[GH-3509](https://github.com/hashicorp/consul/issues/3509)] * **Sentinel Integration:** (Consul Enterprise) Consul's ACL system integrates with [Sentinel](https://www.consul.io/docs/guides/sentinel.html) to enable code policies that apply to KV writes. - IMPROVEMENTS: * agent: Added support to detect public IPv4 and IPv6 addresses on AWS. [[GH-3471](https://github.com/hashicorp/consul/issues/3471)] @@ -142,6 +141,7 @@ IMPROVEMENTS: * agent: Improved ACL system for the KV store to support list permissions. This behavior can be opted in. For more information, see the [ACL Guide](https://www.consul.io/docs/guides/acl.html#list-policy-for-keys). [[GH-3511](https://github.com/hashicorp/consul/issues/3511)] * agent: Updates miekg/dns library to later version to pick up bug fixes and improvements. [[GH-3547](https://github.com/hashicorp/consul/issues/3547)] * agent: Added automatic retries to the RPC path, and a brief RPC drain time when servers leave. These changes make Consul more robust during graceful leaves of Consul servers, such as during upgrades, and help shield applications from "no leader" errors. These are configured with new [`performance`](https://www.consul.io/docs/agent/options.html#performance) options. [[GH-5414](https://github.com/hashicorp/consul/issues/5414)] +* agent: Added a new `discard_check_output` agent-level configuration option that can be used to trade off write load to the Consul servers vs. visibility of health check output. This is reloadable so it can be toggled without fully restarting the agent. [[GH-3562](https://github.com/hashicorp/consul/issues/3562)] * build: Updated Go toolchain to version 1.9.1. [[GH-3537](https://github.com/hashicorp/consul/issues/3537)] * cli: `consul lock` and `consul watch` commands will forward `TERM` and `KILL` signals to their child subprocess. [[GH-3509](https://github.com/hashicorp/consul/issues/3509)] * cli: Added support for [autocompletion](https://www.consul.io/docs/commands/index.html#autocompletion). [[GH-3412](https://github.com/hashicorp/consul/issues/3412)] diff --git a/agent/agent.go b/agent/agent.go index b9479c0e4..6eec5d27e 100644 --- a/agent/agent.go +++ b/agent/agent.go @@ -2386,5 +2386,7 @@ func (a *Agent) ReloadConfig(newCfg *config.RuntimeConfig) error { // Update filtered metrics metrics.UpdateFilter(newCfg.TelemetryAllowedPrefixes, newCfg.TelemetryBlockedPrefixes) + a.state.SetDiscardCheckOutput(newCfg.DiscardCheckOutput) + return nil } diff --git a/agent/config/builder.go b/agent/config/builder.go index 414fe778f..890074282 100644 --- a/agent/config/builder.go +++ b/agent/config/builder.go @@ -579,6 +579,7 @@ func (b *Builder) Build() (rt RuntimeConfig, err error) { DisableKeyringFile: b.boolVal(c.DisableKeyringFile), DisableRemoteExec: b.boolVal(c.DisableRemoteExec), DisableUpdateCheck: b.boolVal(c.DisableUpdateCheck), + DiscardCheckOutput: b.boolVal(c.DiscardCheckOutput), EnableDebug: b.boolVal(c.EnableDebug), EnableScriptChecks: b.boolVal(c.EnableScriptChecks), EnableSyslog: b.boolVal(c.EnableSyslog), diff --git a/agent/config/config.go b/agent/config/config.go index ab98118ec..03c0d6994 100644 --- a/agent/config/config.go +++ b/agent/config/config.go @@ -173,6 +173,7 @@ type Config struct { DisableKeyringFile *bool `json:"disable_keyring_file,omitempty" hcl:"disable_keyring_file" mapstructure:"disable_keyring_file"` DisableRemoteExec *bool `json:"disable_remote_exec,omitempty" hcl:"disable_remote_exec" mapstructure:"disable_remote_exec"` DisableUpdateCheck *bool `json:"disable_update_check,omitempty" hcl:"disable_update_check" mapstructure:"disable_update_check"` + DiscardCheckOutput *bool `json:"discard_check_output" hcl:"discard_check_output" mapstructure:"discard_check_output"` EnableACLReplication *bool `json:"enable_acl_replication,omitempty" hcl:"enable_acl_replication" mapstructure:"enable_acl_replication"` EnableDebug *bool `json:"enable_debug,omitempty" hcl:"enable_debug" mapstructure:"enable_debug"` EnableScriptChecks *bool `json:"enable_script_checks,omitempty" hcl:"enable_script_checks" mapstructure:"enable_script_checks"` diff --git a/agent/config/runtime.go b/agent/config/runtime.go index 5fc2ab361..e3ef7ad2b 100644 --- a/agent/config/runtime.go +++ b/agent/config/runtime.go @@ -133,6 +133,7 @@ type RuntimeConfig struct { DisableKeyringFile bool DisableRemoteExec bool DisableUpdateCheck bool + DiscardCheckOutput bool EnableACLReplication bool EnableDebug bool EnableScriptChecks bool diff --git a/agent/config/runtime_test.go b/agent/config/runtime_test.go index 77f50ddf7..3076d97fc 100644 --- a/agent/config/runtime_test.go +++ b/agent/config/runtime_test.go @@ -2116,6 +2116,7 @@ func TestFullConfig(t *testing.T) { "disable_keyring_file": true, "disable_remote_exec": true, "disable_update_check": true, + "discard_check_output": true, "domain": "7W1xXSqd", "dns_config": { "allow_stale": true, @@ -2549,6 +2550,7 @@ func TestFullConfig(t *testing.T) { disable_keyring_file = true disable_remote_exec = true disable_update_check = true + discard_check_output = true domain = "7W1xXSqd" dns_config { allow_stale = true @@ -3132,6 +3134,7 @@ func TestFullConfig(t *testing.T) { DisableKeyringFile: true, DisableRemoteExec: true, DisableUpdateCheck: true, + DiscardCheckOutput: true, EnableACLReplication: true, EnableDebug: true, EnableScriptChecks: true, @@ -3808,6 +3811,7 @@ func TestSanitize(t *testing.T) { "DisableKeyringFile": false, "DisableRemoteExec": false, "DisableUpdateCheck": false, + "DiscardCheckOutput": false, "EnableACLReplication": false, "EnableDebug": false, "EnableScriptChecks": false, diff --git a/agent/local.go b/agent/local.go index 2fe90a1a5..2a29a9c6c 100644 --- a/agent/local.go +++ b/agent/local.go @@ -88,6 +88,10 @@ type localState struct { // triggerCh is used to inform of a change to local state // that requires anti-entropy with the server triggerCh chan struct{} + + // discardCheckOutput stores whether the output of health checks + // is stored in the raft log. + discardCheckOutput atomic.Value // bool } // NewLocalState creates a is used to initialize the local state @@ -106,7 +110,7 @@ func NewLocalState(c *config.RuntimeConfig, lg *log.Logger, tokens *token.Store) lc.TaggedAddresses[k] = v } - return &localState{ + l := &localState{ config: lc, logger: lg, services: make(map[string]*structs.NodeService), @@ -121,6 +125,8 @@ func NewLocalState(c *config.RuntimeConfig, lg *log.Logger, tokens *token.Store) consulCh: make(chan struct{}, 1), triggerCh: make(chan struct{}, 1), } + l.discardCheckOutput.Store(c.DiscardCheckOutput) + return l } // changeMade is used to trigger an anti-entropy run @@ -161,6 +167,10 @@ func (l *localState) isPaused() bool { return atomic.LoadInt32(&l.paused) > 0 } +func (l *localState) SetDiscardCheckOutput(b bool) { + l.discardCheckOutput.Store(b) +} + // ServiceToken returns the configured ACL token for the given // service ID. If none is present, the agent's token is returned. func (l *localState) ServiceToken(id string) string { @@ -249,11 +259,15 @@ func (l *localState) checkToken(checkID types.CheckID) string { // This entry is persistent and the agent will make a best effort to // ensure it is registered func (l *localState) AddCheck(check *structs.HealthCheck, token string) error { + l.Lock() + defer l.Unlock() + // Set the node name check.Node = l.config.NodeName - l.Lock() - defer l.Unlock() + if l.discardCheckOutput.Load().(bool) { + check.Output = "" + } // if there is a serviceID associated with the check, make sure it exists before adding it // NOTE - This logic may be moved to be handled within the Agent's Addcheck method after a refactor @@ -293,6 +307,10 @@ func (l *localState) UpdateCheck(checkID types.CheckID, status, output string) { return } + if l.discardCheckOutput.Load().(bool) { + output = "" + } + // Update the critical time tracking (this doesn't cause a server updates // so we can always keep this up to date). if status == api.HealthCritical { diff --git a/agent/local_test.go b/agent/local_test.go index 2b2d78d3d..2d0d2fb02 100644 --- a/agent/local_test.go +++ b/agent/local_test.go @@ -1151,6 +1151,56 @@ func TestAgentAntiEntropy_Checks_ACLDeny(t *testing.T) { } } +func TestAgent_UpdateCheck_DiscardOutput(t *testing.T) { + t.Parallel() + a := NewTestAgent(t.Name(), ` + discard_check_output = true + check_update_interval = "0s" # set to "0s" since otherwise output checks are deferred + `) + defer a.Shutdown() + + inSync := func(id string) bool { + a.state.Lock() + defer a.state.Unlock() + return a.state.checkStatus[types.CheckID(id)].inSync + } + + // register a check + check := &structs.HealthCheck{ + Node: a.Config.NodeName, + CheckID: "web", + Name: "web", + Status: api.HealthPassing, + Output: "first output", + } + if err := a.state.AddCheck(check, ""); err != nil { + t.Fatalf("bad: %s", err) + } + + // wait until the check is in sync + retry.Run(t, func(r *retry.R) { + if inSync("web") { + return + } + r.FailNow() + }) + + // update the check with the same status but different output + // and the check should still be in sync. + a.state.UpdateCheck(check.CheckID, api.HealthPassing, "second output") + if !inSync("web") { + t.Fatal("check should be in sync") + } + + // disable discarding of check output and update the check again with different + // output. Then the check should be out of sync. + a.state.SetDiscardCheckOutput(false) + a.state.UpdateCheck(check.CheckID, api.HealthPassing, "third output") + if inSync("web") { + t.Fatal("check should be out of sync") + } +} + func TestAgentAntiEntropy_Check_DeferSync(t *testing.T) { t.Parallel() a := &TestAgent{Name: t.Name(), HCL: ` diff --git a/website/source/docs/agent/options.html.md b/website/source/docs/agent/options.html.md index 7eeaae208..346d1404a 100644 --- a/website/source/docs/agent/options.html.md +++ b/website/source/docs/agent/options.html.md @@ -771,6 +771,11 @@ Consul will not enable TLS for the HTTP API unless the `https` port has been ass * `disable_update_check` Disables automatic checking for security bulletins and new version releases. +* `discard_check_output` + Discards the output of health checks before storing them. This reduces the number of writes + to the Consul raft log in environments where health checks have volatile output like + timestamps, process ids, ... + * `dns_config` This object allows a number of sub-keys to be set which can tune how DNS queries are serviced. See this guide on [DNS caching](/docs/guides/dns-cache.html) for more detail. @@ -1331,4 +1336,5 @@ items which are reloaded include: * Watches * HTTP Client Address * Node Metadata -* Metric Prefix Filter +* Metric Prefix Filter +* Discard Check Output