From 335f0a5371ccf9364db5aace1b0c4586e8e6a482 Mon Sep 17 00:00:00 2001 From: Daniel Bennett Date: Thu, 2 Feb 2023 14:20:26 -0600 Subject: [PATCH] docs: how to troubleshoot consul connect envoy (#15908) * largely a doc-ification of this commit message: d47678074bf8ae9ff2da3c91d0729bf03aee8446 this doesn't spell out all the possible failure modes, but should be a good starting point for folks. * connect: add doc link to envoy bootstrap error * add Unwrap() to RecoverableError mainly for easier testing --- .changelog/15908.txt | 3 +++ .../taskrunner/envoy_bootstrap_hook.go | 9 ++++++- .../taskrunner/envoy_bootstrap_hook_test.go | 4 +-- nomad/structs/structs.go | 6 +++++ .../docs/integrations/consul-connect.mdx | 26 +++++++++++++++++++ 5 files changed, 45 insertions(+), 3 deletions(-) create mode 100644 .changelog/15908.txt diff --git a/.changelog/15908.txt b/.changelog/15908.txt new file mode 100644 index 000000000..22508a1ba --- /dev/null +++ b/.changelog/15908.txt @@ -0,0 +1,3 @@ +```release-note:improvement +docs: link to an envoy troubleshooting doc when envoy bootstrap fails +``` diff --git a/client/allocrunner/taskrunner/envoy_bootstrap_hook.go b/client/allocrunner/taskrunner/envoy_bootstrap_hook.go index 0da4c3d0d..fd852a49a 100644 --- a/client/allocrunner/taskrunner/envoy_bootstrap_hook.go +++ b/client/allocrunner/taskrunner/envoy_bootstrap_hook.go @@ -43,6 +43,10 @@ const ( envoyBootstrapMaxJitter = 500 * time.Millisecond ) +var ( + errEnvoyBootstrapError = errors.New("error creating bootstrap configuration for Connect proxy sidecar") +) + type consulTransportConfig struct { HTTPAddr string // required Auth string // optional, env CONSUL_HTTP_AUTH @@ -373,7 +377,10 @@ func (h *envoyBootstrapHook) Prestart(ctx context.Context, req *ifs.TaskPrestart // Wrap the last error from Consul and set that as our status. 
_, recoverable := cmdErr.(*exec.ExitError) return structs.NewRecoverableError( - fmt.Errorf("error creating bootstrap configuration for Connect proxy sidecar: %v", cmdErr), + fmt.Errorf("%w: %v; see: <https://developer.hashicorp.com/nomad/s/envoy-bootstrap-error>", + errEnvoyBootstrapError, + cmdErr, + ), recoverable, ) } diff --git a/client/allocrunner/taskrunner/envoy_bootstrap_hook_test.go b/client/allocrunner/taskrunner/envoy_bootstrap_hook_test.go index 4fcfb623b..344332960 100644 --- a/client/allocrunner/taskrunner/envoy_bootstrap_hook_test.go +++ b/client/allocrunner/taskrunner/envoy_bootstrap_hook_test.go @@ -672,7 +672,7 @@ func TestTaskRunner_EnvoyBootstrapHook_RecoverableError(t *testing.T) { // Run the hook err := h.Prestart(context.Background(), req, resp) - require.EqualError(t, err, "error creating bootstrap configuration for Connect proxy sidecar: exit status 1") + require.ErrorIs(t, err, errEnvoyBootstrapError) require.True(t, structs.IsRecoverable(err)) // Assert it is not Done @@ -760,7 +760,7 @@ func TestTaskRunner_EnvoyBootstrapHook_retryTimeout(t *testing.T) { // Run the hook and get the error err := h.Prestart(context.Background(), req, &resp) - require.EqualError(t, err, "error creating bootstrap configuration for Connect proxy sidecar: exit status 1") + require.ErrorIs(t, err, errEnvoyBootstrapError) // Current time should be at least start time + total wait time minimum := begin.Add(h.envoyBootstrapWaitTime) diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go index eb05ade65..7f8c0cf9d 100644 --- a/nomad/structs/structs.go +++ b/nomad/structs/structs.go @@ -11906,6 +11906,7 @@ type KeyringRequest struct { type RecoverableError struct { Err string Recoverable bool + wrapped error } // NewRecoverableError is used to wrap an error and mark it as recoverable or @@ -11918,6 +11919,7 @@ func NewRecoverableError(e error, recoverable bool) error { return &RecoverableError{ Err: e.Error(), Recoverable: recoverable, + wrapped: e, } } @@ -11940,6 +11942,10 @@ func (r *RecoverableError) 
IsUnrecoverable() bool { return !r.Recoverable } +func (r *RecoverableError) Unwrap() error { + return r.wrapped +} + // Recoverable is an interface for errors to implement to indicate whether or // not they are fatal or recoverable. type Recoverable interface { diff --git a/website/content/docs/integrations/consul-connect.mdx b/website/content/docs/integrations/consul-connect.mdx index 34fa55cde..a20efebb3 100644 --- a/website/content/docs/integrations/consul-connect.mdx +++ b/website/content/docs/integrations/consul-connect.mdx @@ -371,6 +371,32 @@ dashes (`-`) are converted to underscores (`_`) in environment variables so - Prior to Consul 1.9, the Envoy sidecar proxy will drop and stop accepting connections while the Nomad agent is restarting. +## Troubleshooting + +If the sidecar service is not running correctly, you can investigate +potential `envoy` failures in the following ways: + +* Task logs in the associated `connect-*` task +* Task secrets (may contain sensitive information): + * envoy CLI command: `secrets/.envoy_bootstrap.cmd` + * environment variables: `secrets/.envoy_bootstrap.env` +* An extra Allocation log file: `alloc/logs/envoy_bootstrap.stderr.0` + +For example, with an allocation ID starting with `b36a`: + +```shell-session +nomad alloc status -short b36a # to get the connect-* task name +nomad alloc logs -task connect-proxy-count-api -stderr b36a +nomad alloc exec -task connect-proxy-count-api b36a cat secrets/.envoy_bootstrap.cmd +nomad alloc exec -task connect-proxy-count-api b36a cat secrets/.envoy_bootstrap.env +nomad alloc fs b36a alloc/logs/envoy_bootstrap.stderr.0 +``` + +Note: If the alloc is unable to start successfully, debugging files may +only be accessible from the host filesystem. However, the sidecar task secrets +directory may not be available in systems where it is mounted in a temporary +filesystem. 
+ [count-dashboard]: /img/count-dashboard.png [consul_acl]: https://github.com/hashicorp/consul/issues/7414 [gh-9907]: https://github.com/hashicorp/nomad/issues/9907