Adds new config to make script checks opt-in, updates documentation. (#3284)

2017-07-17 11:20:35 -07:00 · 2017-07-17 11:20:35 -07:00 · 788dd255a1
parent f7629a4f66
commit 788dd255a1
12 changed files with 125 additions and 26 deletions
--- a/agent/agent.go
+++ b/agent/agent.go
@ -1595,8 +1595,15 @@ func (a *Agent) AddCheck(check *structs.HealthCheck, chkType *structs.CheckType,
 	if check.CheckID == "" {
 		return fmt.Errorf("CheckID missing")
 	}
-	if chkType != nil && !chkType.Valid() {
-		return fmt.Errorf("Check type is not valid")
+
+	if chkType != nil {
+		if !chkType.Valid() {
+			return fmt.Errorf("Check type is not valid")
+		}
+
+		if chkType.IsScript() && !a.config.EnableScriptChecks {
+			return fmt.Errorf("Check types that exec scripts are disabled on this agent")
+		}
 	}

 	if check.ServiceID != "" {
--- a/agent/agent_test.go
+++ b/agent/agent_test.go
@ -628,7 +628,9 @@ func TestAgent_RemoveServiceRemovesAllChecks(t *testing.T) {

 func TestAgent_AddCheck(t *testing.T) {
 	t.Parallel()
-	a := NewTestAgent(t.Name(), nil)
+	cfg := TestConfig()
+	cfg.EnableScriptChecks = true
+	a := NewTestAgent(t.Name(), cfg)
 	defer a.Shutdown()

 	health := &structs.HealthCheck{
@ -665,7 +667,9 @@ func TestAgent_AddCheck(t *testing.T) {

 func TestAgent_AddCheck_StartPassing(t *testing.T) {
 	t.Parallel()
-	a := NewTestAgent(t.Name(), nil)
+	cfg := TestConfig()
+	cfg.EnableScriptChecks = true
+	a := NewTestAgent(t.Name(), cfg)
 	defer a.Shutdown()

 	health := &structs.HealthCheck{
@ -702,7 +706,9 @@ func TestAgent_AddCheck_StartPassing(t *testing.T) {

 func TestAgent_AddCheck_MinInterval(t *testing.T) {
 	t.Parallel()
-	a := NewTestAgent(t.Name(), nil)
+	cfg := TestConfig()
+	cfg.EnableScriptChecks = true
+	a := NewTestAgent(t.Name(), cfg)
 	defer a.Shutdown()

 	health := &structs.HealthCheck{
@ -735,7 +741,9 @@ func TestAgent_AddCheck_MinInterval(t *testing.T) {

 func TestAgent_AddCheck_MissingService(t *testing.T) {
 	t.Parallel()
-	a := NewTestAgent(t.Name(), nil)
+	cfg := TestConfig()
+	cfg.EnableScriptChecks = true
+	a := NewTestAgent(t.Name(), cfg)
 	defer a.Shutdown()

 	health := &structs.HealthCheck{
@ -797,9 +805,38 @@ func TestAgent_AddCheck_RestoreState(t *testing.T) {
 	}
 }

+func TestAgent_AddCheck_ExecDisable(t *testing.T) {
+	t.Parallel()
+
+	a := NewTestAgent(t.Name(), nil)
+	defer a.Shutdown()
+
+	health := &structs.HealthCheck{
+		Node:    "foo",
+		CheckID: "mem",
+		Name:    "memory util",
+		Status:  api.HealthCritical,
+	}
+	chk := &structs.CheckType{
+		Script:   "exit 0",
+		Interval: 15 * time.Second,
+	}
+	err := a.AddCheck(health, chk, false, "")
+	if err == nil || !strings.Contains(err.Error(), "exec scripts are disabled on this agent") {
+		t.Fatalf("err: %v", err)
+	}
+
+	// Ensure we don't have a check mapping
+	if memChk := a.state.Checks()["mem"]; memChk != nil {
+		t.Fatalf("should be missing mem check")
+	}
+}
+
 func TestAgent_RemoveCheck(t *testing.T) {
 	t.Parallel()
-	a := NewTestAgent(t.Name(), nil)
+	cfg := TestConfig()
+	cfg.EnableScriptChecks = true
+	a := NewTestAgent(t.Name(), cfg)
 	defer a.Shutdown()

 	// Remove check that doesn't exist
@ -1097,6 +1134,7 @@ func TestAgent_PersistCheck(t *testing.T) {
 	cfg := TestConfig()
 	cfg.Server = false
 	cfg.DataDir = testutil.TempDir(t, "agent") // we manage the data dir
+	cfg.EnableScriptChecks = true
 	a := NewTestAgent(t.Name(), cfg)
 	defer os.RemoveAll(cfg.DataDir)
 	defer a.Shutdown()
@ -1230,6 +1268,7 @@ func TestAgent_PurgeCheckOnDuplicate(t *testing.T) {
 	cfg := TestConfig()
 	cfg.Server = false
 	cfg.DataDir = testutil.TempDir(t, "agent") // we manage the data dir
+	cfg.EnableScriptChecks = true
 	a := NewTestAgent(t.Name(), cfg)
 	defer os.RemoveAll(cfg.DataDir)
 	defer a.Shutdown()
--- a/agent/config.go
+++ b/agent/config.go
@ -625,6 +625,11 @@ type Config struct {
 	// true, we ignore the leave, and rejoin the cluster on start.
 	RejoinAfterLeave bool `mapstructure:"rejoin_after_leave"`

+	// EnableScriptChecks controls whether health checks which execute
+	// scripts are enabled. This includes regular script checks and Docker
+	// checks.
+	EnableScriptChecks bool `mapstructure:"enable_script_checks"`
+
 	// CheckUpdateInterval controls the interval on which the output of a health check
 	// is updated if there is no change to the state. For example, a check in a steady
 	// state may run every 5 second generating a unique output (timestamp, etc), forcing
@ -1932,6 +1937,9 @@ func MergeConfig(a, b *Config) *Config {
 	if b.DNSConfig.RecursorTimeout != 0 {
 		result.DNSConfig.RecursorTimeout = b.DNSConfig.RecursorTimeout
 	}
+	if b.EnableScriptChecks {
+		result.EnableScriptChecks = true
+	}
 	if b.CheckUpdateIntervalRaw != "" || b.CheckUpdateInterval != 0 {
 		result.CheckUpdateInterval = b.CheckUpdateInterval
 	}
--- a/agent/config_test.go
+++ b/agent/config_test.go
@ -322,6 +322,10 @@ func TestDecodeConfig(t *testing.T) {
 			in: `{"disable_keyring_file":true}`,
 			c:  &Config{DisableKeyringFile: true},
 		},
+		{
+			in: `{"enable_script_checks":true}`,
+			c:  &Config{EnableScriptChecks: true},
+		},
 		{
 			in: `{"encrypt_verify_incoming":true}`,
 			c:  &Config{EncryptVerifyIncoming: Bool(true)},
@ -1363,6 +1367,7 @@ func TestMergeConfig(t *testing.T) {
 		ReconnectTimeoutLan:    24 * time.Hour,
 		ReconnectTimeoutWanRaw: "36h",
 		ReconnectTimeoutWan:    36 * time.Hour,
+		EnableScriptChecks:     true,
 		CheckUpdateInterval:    8 * time.Minute,
 		CheckUpdateIntervalRaw: "8m",
 		ACLToken:               "1111",
--- a/agent/consul/structs/check_type.go
+++ b/agent/consul/structs/check_type.go
@ -47,6 +47,11 @@ func (c *CheckType) Valid() bool {
 	return c.IsTTL() || c.IsMonitor() || c.IsHTTP() || c.IsTCP() || c.IsDocker()
 }

+// IsScript checks if this is a check that execs some kind of script.
+func (c *CheckType) IsScript() bool {
+	return c.Script != ""
+}
+
 // IsTTL checks if this is a TTL type
 func (c *CheckType) IsTTL() bool {
 	return c.TTL != 0
--- a/api/agent_test.go
+++ b/api/agent_test.go
@ -529,7 +529,9 @@ func TestAPI_AgentChecks_serviceBound(t *testing.T) {

 func TestAPI_AgentChecks_Docker(t *testing.T) {
 	t.Parallel()
-	c, s := makeClient(t)
+	c, s := makeClientWithConfig(t, nil, func(c *testutil.TestServerConfig) {
+		c.EnableScriptChecks = true
+	})
 	defer s.Stop()

 	agent := c.Agent()
--- a/command/agent.go
+++ b/command/agent.go
@ -80,6 +80,7 @@ func (cmd *AgentCommand) readConfig() *agent.Config {
 		"A unique ID for this node across space and time. Defaults to a randomly-generated ID"+
 			" that persists in the data-dir.")

+	f.BoolVar(&cmdCfg.EnableScriptChecks, "enable-script-checks", false, "Enables health check scripts.")
 	var disableHostNodeID configutil.BoolValue
 	f.Var(&disableHostNodeID, "disable-host-node-id",
 		"Setting this to true will prevent Consul from using information from the"+
--- a/testutil/server.go
+++ b/testutil/server.go
@ -86,6 +86,7 @@ type TestServerConfig struct {
 	VerifyIncomingRPC   bool                   `json:"verify_incoming_rpc,omitempty"`
 	VerifyIncomingHTTPS bool                   `json:"verify_incoming_https,omitempty"`
 	VerifyOutgoing      bool                   `json:"verify_outgoing,omitempty"`
+	EnableScriptChecks  bool                   `json:"enable_script_checks,omitempty"`
 	ReadyTimeout        time.Duration          `json:"-"`
 	Stdout, Stderr      io.Writer              `json:"-"`
 	Args                []string               `json:"-"`
--- a/website/source/docs/agent/checks.html.md
+++ b/website/source/docs/agent/checks.html.md
@ -21,10 +21,12 @@ There are five different kinds of checks:
  that performs the health check, exits with an appropriate exit code, and potentially
  generates some output. A script is paired with an invocation interval (e.g.
  every 30 seconds). This is similar to the Nagios plugin system. The output of
-  a script check is limited to 4K. Output larger than this will be truncated.
+  a script check is limited to 4KB. Output larger than this will be truncated.
  By default, Script checks will be configured with a timeout equal to 30 seconds.
  It is possible to configure a custom Script check timeout value by specifying the
-  `timeout` field in the check definition.
+  `timeout` field in the check definition. In Consul 0.9.0 and later, the agent
+  must be configured with [`enable_script_checks`](/docs/agent/options.html#_enable_script_checks)
+  set to `true` in order to enable script checks.

 * HTTP + Interval - These checks make an HTTP `GET` request every Interval (e.g.
  every 30 seconds) to the specified URL. The status of the service depends on
@ -38,7 +40,7 @@ There are five different kinds of checks:
  configured with a request timeout equal to the check interval, with a max of
  10 seconds. It is possible to configure a custom HTTP check timeout value by
  specifying the `timeout` field in the check definition. The output of the
-  check is limited to roughly 4K. Responses larger than this will be truncated.
+  check is limited to roughly 4KB. Responses larger than this will be truncated.
  HTTP checks also support SSL. By default, a valid SSL certificate is expected.
  Certificate verification can be turned off by setting the `tls_skip_verify`
  field to `true` in the check definition.
@ -74,15 +76,17 @@ There are five different kinds of checks:
  valid through the end of the TTL from the time of the last check.

 * Docker + Interval - These checks depend on invoking an external application which
-is packaged within a Docker Container. The application is triggered within the running
-container via the Docker Exec API. We expect that the Consul agent user has access
-to either the Docker HTTP API or the unix socket. Consul uses ```$DOCKER_HOST``` to
-determine the Docker API endpoint. The application is expected to run, perform a health
-check of the service running inside the container, and exit with an appropriate exit code.
-The check should be paired with an invocation interval. The shell on which the check
-has to be performed is configurable which makes it possible to run containers which
-have different shells on the same host. Check output for Docker is limited to
-4K. Any output larger than this will be truncated.
+  is packaged within a Docker Container. The application is triggered within the running
+  container via the Docker Exec API. We expect that the Consul agent user has access
+  to either the Docker HTTP API or the unix socket. Consul uses ```$DOCKER_HOST``` to
+  determine the Docker API endpoint. The application is expected to run, perform a health
+  check of the service running inside the container, and exit with an appropriate exit code.
+  The check should be paired with an invocation interval. The shell on which the check
+  has to be performed is configurable which makes it possible to run containers which
+  have different shells on the same host. Check output for Docker is limited to
+  4KB. Any output larger than this will be truncated. In Consul 0.9.0 and later, the agent
+  must be configured with [`enable_script_checks`](/docs/agent/options.html#_enable_script_checks)
+  set to `true` in order to enable Docker health checks.

 ## Check Definition

@ -210,6 +214,10 @@ This is the only convention that Consul depends on. Any output of the script
 will be captured and stored in the `notes` field so that it can be viewed
 by human operators.

+In Consul 0.9.0 and later, the agent must be configured with
+[`enable_script_checks`](/docs/agent/options.html#_enable_script_checks) set to `true`
+in order to enable script checks.
+
 ## Initial Health Check Status

 By default, when checks are registered against a Consul agent, the state is set
--- a/website/source/docs/agent/options.html.md
+++ b/website/source/docs/agent/options.html.md
@ -147,6 +147,10 @@ will exit with an error at startup.
  [Nomad](https://www.nomadproject.io/), so if you opt-in to host-based IDs then Consul and Nomad will use
  information on the host to automatically assign the same ID in both systems.

+* <a name="_disable_keyring_file"></a><a href="#_disable_keyring_file">`-disable-keyring-file`</a> - If set,
+  the keyring will not be persisted to a file. Any installed keys will be lost on shutdown, and only the given
+  `-encrypt` key will be available on startup. This defaults to false.
+
 * <a name="_dns_port"></a><a href="#_dns_port">`-dns-port`</a> - the DNS port to listen on.
  This overrides the default port 8600. This is available in Consul 0.7 and later.

@ -154,6 +158,12 @@ will exit with an error at startup.
  in the "consul." domain. This flag can be used to change that domain. All queries in this domain
  are assumed to be handled by Consul and will not be recursively resolved.

+* <a name="_enable_script_checks"></a><a href="#_enable_script_checks">`-enable-script-checks`</a> - This
+  controls whether [health checks that execute scripts](/docs/agent/checks.html) are enabled on
+  this agent, and defaults to `false` so operators must opt in to allowing these. If enabled,
+  it is recommended to [enable ACLs](/docs/guides/acl.html) as well to control which users are
+  allowed to register new checks to execute scripts. This was added in Consul 0.9.0.
+
 * <a name="_encrypt"></a><a href="#_encrypt">`-encrypt`</a> - Specifies the secret key to
  use for encryption of Consul
  network traffic. This key must be 16-bytes that are Base64-encoded. The
@ -167,10 +177,6 @@ will exit with an error at startup.
  initialized with an encryption key, then the provided key is ignored and
  a warning will be displayed.

-* <a name="_disable_keyring_file"></a><a href="#_disable_keyring_file">`-disable-keyring-file`</a> - If set,
-  the keyring will not be persisted to a file. Any installed keys will be lost on shutdown, and only the given
-  `-encrypt` key will be available on startup. This defaults to false.
-
 * <a name="_http_port"></a><a href="#_http_port">`-http-port`</a> - the HTTP API port to listen on.
  This overrides the default port 8500. This option is very useful when deploying Consul
  to an environment which communicates the HTTP port through the environment e.g. PaaS like CloudFoundry, allowing
@ -712,6 +718,9 @@ Consul will not enable TLS for the HTTP API unless the `https` port has been ass
 * <a name="enable_debug"></a><a href="#enable_debug">`enable_debug`</a> When set, enables some
  additional debugging features. Currently, this is only used to set the runtime profiling HTTP endpoints.

+* <a name="enable_script_checks"></a><a href="#enable_script_checks">`enable_script_checks`</a> Equivalent to the
+  [`-enable-script-checks` command-line flag](#_enable_script_checks).
+
 * <a name="enable_syslog"></a><a href="#enable_syslog">`enable_syslog`</a> Equivalent to
  the [`-syslog` command-line flag](#_syslog).

--- a/website/source/docs/guides/acl.html.md
+++ b/website/source/docs/guides/acl.html.md
@ -684,6 +684,10 @@ to use for registration events:
   [checks](/docs/agent/checks.html). Tokens may also be passed to the
   [HTTP API](/api/index.html) for operations that require them.

+In addition to ACLs, in Consul 0.9.0 and later, the agent must be configured with
+[`enable_script_checks`](/docs/agent/options.html#_enable_script_checks) set to `true` in order to enable
+script checks.
+
 #### Operator Rules

 The `operator` policy controls access to cluster-level operations in the
@ -866,6 +870,10 @@ to use for registration events:
   [checks](/docs/agent/checks.html). Tokens may also be passed to the
   [HTTP API](/api/index.html) for operations that require them.

+In addition to ACLs, in Consul 0.9.0 and later, the agent must be configured with
+[`enable_script_checks`](/docs/agent/options.html#_enable_script_checks) set to `true` in order to enable
+script checks.
+
 #### Session Rules

 The `session` policy controls access to [Session API](/api/session.html) operations.
--- a/website/source/intro/getting-started/join.html.md
+++ b/website/source/intro/getting-started/join.html.md
@ -72,6 +72,12 @@ the replicated log until the expected number of servers has successfully joined.
 You can read more about this in the [bootstrapping
 guide](/docs/guides/bootstrapping.html).

+We've included the [`-enable-script-checks`](/docs/agent/options.html#_enable_script_checks)
+flag set to `true` in order to enable health checks that can execute external scripts.
+This will be used in examples later. For production use, you'd want to configure
+[ACLs](/docs/guides/acl.html) in conjunction with this to control the ability to
+register arbitrary scripts.
+
 Finally, we add the [`config-dir` flag](/docs/agent/options.html#_config_dir),
 marking where service and check definitions can be found.

@ -81,7 +87,7 @@ All together, these settings yield a
 ```text
 vagrant@n1:~$ consul agent -server -bootstrap-expect=1 \
 	-data-dir=/tmp/consul -node=agent-one -bind=172.20.20.10 \
-	-config-dir=/etc/consul.d
+	-enable-script-checks=true -config-dir=/etc/consul.d
 ...
 ```

@ -102,7 +108,7 @@ All together, these settings yield a

 ```text
 vagrant@n2:~$ consul agent -data-dir=/tmp/consul -node=agent-two \
-	-bind=172.20.20.11 -config-dir=/etc/consul.d
+	-bind=172.20.20.11 -enable-script-checks=true -config-dir=/etc/consul.d
 ...
 ```