docs: how to troubleshoot consul connect envoy (#15908)

* largely a doc-ification of this commit message:
  d47678074bf8ae9ff2da3c91d0729bf03aee8446
  this doesn't spell out all the possible failure modes,
  but should be a good starting point for folks.

* connect: add doc link to envoy bootstrap error

* add Unwrap() to RecoverableError
  mainly for easier testing
This commit is contained in:
Daniel Bennett 2023-02-02 14:20:26 -06:00 committed by GitHub
parent dec41f7f01
commit 335f0a5371
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 45 additions and 3 deletions

3
.changelog/15908.txt Normal file
View File

@ -0,0 +1,3 @@
```release-note:improvement
docs: link to an envoy troubleshooting doc when envoy bootstrap fails
```

View File

@ -43,6 +43,10 @@ const (
envoyBootstrapMaxJitter = 500 * time.Millisecond
)
// errEnvoyBootstrapError is the sentinel that Prestart wraps (with %w) into
// the error it returns when the Envoy bootstrap configuration cannot be
// created, so callers and tests can match the failure with errors.Is instead
// of comparing message strings.
var errEnvoyBootstrapError = errors.New("error creating bootstrap configuration for Connect proxy sidecar")
type consulTransportConfig struct {
HTTPAddr string // required
Auth string // optional, env CONSUL_HTTP_AUTH
@ -373,7 +377,10 @@ func (h *envoyBootstrapHook) Prestart(ctx context.Context, req *ifs.TaskPrestart
// Wrap the last error from Consul and set that as our status.
_, recoverable := cmdErr.(*exec.ExitError)
return structs.NewRecoverableError(
fmt.Errorf("error creating bootstrap configuration for Connect proxy sidecar: %v", cmdErr),
fmt.Errorf("%w: %v; see: <https://www.nomadproject.io/s/envoy-bootstrap-error>",
errEnvoyBootstrapError,
cmdErr,
),
recoverable,
)
}

View File

@ -672,7 +672,7 @@ func TestTaskRunner_EnvoyBootstrapHook_RecoverableError(t *testing.T) {
// Run the hook
err := h.Prestart(context.Background(), req, resp)
require.EqualError(t, err, "error creating bootstrap configuration for Connect proxy sidecar: exit status 1")
require.ErrorIs(t, err, errEnvoyBootstrapError)
require.True(t, structs.IsRecoverable(err))
// Assert it is not Done
@ -760,7 +760,7 @@ func TestTaskRunner_EnvoyBootstrapHook_retryTimeout(t *testing.T) {
// Run the hook and get the error
err := h.Prestart(context.Background(), req, &resp)
require.EqualError(t, err, "error creating bootstrap configuration for Connect proxy sidecar: exit status 1")
require.ErrorIs(t, err, errEnvoyBootstrapError)
// Current time should be at least start time + total wait time
minimum := begin.Add(h.envoyBootstrapWaitTime)

View File

@ -11906,6 +11906,7 @@ type KeyringRequest struct {
// RecoverableError carries an error's message together with a flag stating
// whether the failure is recoverable. It also retains the original error
// value so that Unwrap can hand it back.
type RecoverableError struct {
	// Err is the message of the original error (set from e.Error() by
	// NewRecoverableError).
	Err string

	// Recoverable reports whether the error is considered recoverable.
	Recoverable bool

	// wrapped is the original error value returned by Unwrap. It is
	// unexported, so it is only populated by code in this package.
	wrapped error
}
// NewRecoverableError is used to wrap an error and mark it as recoverable or
@ -11918,6 +11919,7 @@ func NewRecoverableError(e error, recoverable bool) error {
return &RecoverableError{
Err: e.Error(),
Recoverable: recoverable,
wrapped: e,
}
}
@ -11940,6 +11942,10 @@ func (r *RecoverableError) IsUnrecoverable() bool {
return !r.Recoverable
}
// Unwrap returns the original error stored in r by NewRecoverableError, which
// lets errors.Is and errors.As inspect the underlying cause. It returns nil
// when no original error was recorded.
func (r *RecoverableError) Unwrap() error {
	return r.wrapped
}
// Recoverable is an interface for errors to implement to indicate whether or
// not they are fatal or recoverable.
type Recoverable interface {

View File

@ -371,6 +371,32 @@ dashes (`-`) are converted to underscores (`_`) in environment variables so
- Prior to Consul 1.9, the Envoy sidecar proxy will drop and stop accepting
connections while the Nomad agent is restarting.
## Troubleshooting
If the sidecar service is not running correctly, you can investigate
potential `envoy` failures in the following ways:
* Task logs in the associated `connect-*` task
* Task secrets (may contain sensitive information):
  * envoy CLI command: `secrets/.envoy_bootstrap.cmd`
  * environment variables: `secrets/.envoy_bootstrap.env`
* An extra Allocation log file: `alloc/logs/envoy_bootstrap.stderr.0`
For example, with an allocation ID starting with `b36a`:
```shell-session
nomad alloc status -short b36a # to get the connect-* task name
nomad alloc logs -task connect-proxy-count-api -stderr b36a
nomad alloc exec -task connect-proxy-count-api b36a cat secrets/.envoy_bootstrap.cmd
nomad alloc exec -task connect-proxy-count-api b36a cat secrets/.envoy_bootstrap.env
nomad alloc fs b36a alloc/logs/envoy_bootstrap.stderr.0
```
Note: If the allocation is unable to start successfully, debugging files may
only be accessible from the host filesystem. However, the sidecar task secrets
directory may not be available on systems where it is mounted in a temporary
filesystem.
[count-dashboard]: /img/count-dashboard.png
[consul_acl]: https://github.com/hashicorp/consul/issues/7414
[gh-9907]: https://github.com/hashicorp/nomad/issues/9907