troubleshoot: fixes and updated messages (#16294)

This commit is contained in:
Nitya Dhanushkodi 2023-02-17 07:43:05 -08:00 committed by GitHub
parent c66f9ebf39
commit 9d255fe057
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
12 changed files with 135 additions and 80 deletions

View File

@ -55,7 +55,7 @@ func TestValidateUpstreams(t *testing.T) {
delete(ir.Index[xdscommon.ListenerType], listenerName)
return ir
},
err: "no listener for upstream \"db\"",
err: "No listener for upstream \"db\"",
},
{
name: "tcp-missing-cluster",
@ -66,7 +66,7 @@ func TestValidateUpstreams(t *testing.T) {
delete(ir.Index[xdscommon.ClusterType], sni)
return ir
},
err: "no cluster \"db.default.dc1.internal.11111111-2222-3333-4444-555555555555.consul\" for upstream \"db\"",
err: "No cluster \"db.default.dc1.internal.11111111-2222-3333-4444-555555555555.consul\" for upstream \"db\"",
},
{
name: "http-success",
@ -124,7 +124,7 @@ func TestValidateUpstreams(t *testing.T) {
delete(ir.Index[xdscommon.RouteType], "db")
return ir
},
err: "no route for upstream \"db\"",
err: "No route for upstream \"db\"",
},
{
name: "redirect",
@ -170,7 +170,7 @@ func TestValidateUpstreams(t *testing.T) {
delete(ir.Index[xdscommon.ClusterType], sni)
return ir
},
err: "no cluster \"google.default.dc1.internal.11111111-2222-3333-4444-555555555555.consul\" for upstream \"240.0.0.1\"",
err: "No cluster \"google.default.dc1.internal.11111111-2222-3333-4444-555555555555.consul\" for upstream \"240.0.0.1\"",
},
{
name: "tproxy-http-redirect-success",
@ -230,7 +230,9 @@ func TestValidateUpstreams(t *testing.T) {
var outputErrors string
for _, msgError := range messages.Errors() {
outputErrors += msgError.Message
outputErrors += msgError.PossibleActions
for _, action := range msgError.PossibleActions {
outputErrors += action
}
}
if len(tt.err) == 0 {
require.True(t, messages.Success())

View File

@ -77,12 +77,12 @@ func (c *cmd) Run(args []string) int {
t, err := troubleshoot.NewTroubleshoot(adminBindIP, adminPort)
if err != nil {
c.UI.Error("error generating troubleshoot client: " + err.Error())
c.UI.Error("Error generating troubleshoot client: " + err.Error())
return 1
}
messages, err := t.RunAllTests(c.upstreamEnvoyID, c.upstreamIP)
if err != nil {
c.UI.Error("error running the tests: " + err.Error())
c.UI.Error("Error running the tests: " + err.Error())
return 1
}
@ -92,11 +92,16 @@ func (c *cmd) Run(args []string) int {
c.UI.SuccessOutput(o.Message)
} else {
c.UI.ErrorOutput(o.Message)
if o.PossibleActions != "" {
c.UI.UnchangedOutput(o.PossibleActions)
for _, action := range o.PossibleActions {
c.UI.UnchangedOutput("-> " + action)
}
}
}
if messages.Success() {
c.UI.UnchangedOutput("If you are still experiencing issues, you can:")
c.UI.UnchangedOutput("-> Check intentions to ensure the upstream allows traffic from this source")
c.UI.UnchangedOutput("-> If using transparent proxy, ensure DNS resolution is to the same IP you have verified here")
}
return 0
}
@ -114,14 +119,15 @@ const (
Usage: consul troubleshoot proxy [options]
Connects to local envoy proxy and troubleshoots service mesh communication issues.
Requires an upstream service envoy identifier.
Requires an upstream service identifier. When debugging explicitly configured upstreams,
use -upstream-envoy-id, when debugging transparent proxy upstreams use -upstream-ip.
Examples:
(explicit upstreams only)
$ consul troubleshoot proxy -upstream-envoy-id foo
(transparent proxy only)
$ consul troubleshoot proxy -upstream-ip <IP>
$ consul troubleshoot proxy -upstream-ip 240.0.0.1
where 'foo' is the upstream envoy identifier which
where 'foo' is the upstream envoy identifier and '240.0.0.1' is an upstream ip which
can be obtained by running:
$ consul troubleshoot upstreams [options]
`

View File

@ -77,24 +77,24 @@ func (c *cmd) Run(args []string) int {
return 1
}
c.UI.Output(fmt.Sprintf("==> Upstreams (explicit upstreams only) (%v)", len(envoyIDs)))
c.UI.HeaderOutput(fmt.Sprintf("Upstreams (explicit upstreams only) (%v)\n", len(envoyIDs)))
for _, u := range envoyIDs {
c.UI.Output(u)
c.UI.UnchangedOutput(u)
}
c.UI.Output(fmt.Sprintf("\n==> Upstream IPs (transparent proxy only) (%v)", len(upstreamIPs)))
c.UI.HeaderOutput(fmt.Sprintf("Upstream IPs (transparent proxy only) (%v)", len(upstreamIPs)))
tbl := cli.NewTable("IPs ", "Virtual ", "Cluster Names")
for _, u := range upstreamIPs {
tbl.AddRow([]string{formatIPs(u.IPs), strconv.FormatBool(u.IsVirtual), formatClusterNames(u.ClusterNames)}, []string{})
}
c.UI.Table(tbl)
c.UI.Output("\nIf you don't see your upstream address or cluster for a transparent proxy upstream:")
c.UI.Output("- Check intentions: Tproxy upstreams are configured based on intentions, make sure you " +
c.UI.UnchangedOutput("\nIf you cannot find the upstream address or cluster for a transparent proxy upstream:")
c.UI.UnchangedOutput("-> Check intentions: Transparent proxy upstreams are configured based on intentions. Make sure you " +
"have configured intentions to allow traffic to your upstream.")
c.UI.Output("- You can also check that the right cluster is being dialed by running a DNS lookup " +
"for the upstream you are dialing (i.e dig backend.svc.consul). If the address you get from that is missing " +
"from the Upstream IPs your proxy may be misconfigured.")
c.UI.UnchangedOutput("-> To check that the right cluster is being dialed, run a DNS lookup " +
"for the upstream you are dialing. For example, run `dig backend.svc.consul` to return the IP address for the `backend` service. If the address you get from that is missing " +
"from the upstream IPs, it means that your proxy may be misconfigured.")
return 0
}

View File

@ -3,11 +3,12 @@ package troubleshoot
import (
"context"
"fmt"
"github.com/stretchr/testify/assert"
"strings"
"testing"
"time"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
libcluster "github.com/hashicorp/consul/test/integration/consul-container/libs/cluster"
@ -40,11 +41,12 @@ func TestTroubleshootProxy(t *testing.T) {
"-envoy-admin-endpoint", fmt.Sprintf("localhost:%v", clientAdminPort),
"-upstream-envoy-id", libservice.StaticServerServiceName})
require.NoError(t, err)
certsValid := strings.Contains(output, "certificates are valid")
listenersExist := strings.Contains(output, fmt.Sprintf("listener for upstream \"%s\" found", libservice.StaticServerServiceName))
routesExist := strings.Contains(output, fmt.Sprintf("route for upstream \"%s\" found", libservice.StaticServerServiceName))
healthyEndpoints := strings.Contains(output, "✓ healthy endpoints for cluster")
return upstreamExists && certsValid && listenersExist && routesExist && healthyEndpoints
certsValid := strings.Contains(output, "Certificates are valid")
noRejectedConfig := strings.Contains(output, "Envoy has 0 rejected configurations")
noConnFailure := strings.Contains(output, "Envoy has detected 0 connection failure(s)")
listenersExist := strings.Contains(output, fmt.Sprintf("Listener for upstream \"%s\" found", libservice.StaticServerServiceName))
healthyEndpoints := strings.Contains(output, "Healthy endpoints for cluster")
return upstreamExists && certsValid && listenersExist && noRejectedConfig && noConnFailure && healthyEndpoints
}, 60*time.Second, 10*time.Second)
})
@ -58,11 +60,12 @@ func TestTroubleshootProxy(t *testing.T) {
"-upstream-envoy-id", libservice.StaticServerServiceName})
require.NoError(t, err)
certsValid := strings.Contains(output, "certificates are valid")
listenersExist := strings.Contains(output, fmt.Sprintf("listener for upstream \"%s\" found", libservice.StaticServerServiceName))
routesExist := strings.Contains(output, fmt.Sprintf("route for upstream \"%s\" found", libservice.StaticServerServiceName))
endpointUnhealthy := strings.Contains(output, "no healthy endpoints for cluster")
return certsValid && listenersExist && routesExist && endpointUnhealthy
certsValid := strings.Contains(output, "Certificates are valid")
noRejectedConfig := strings.Contains(output, "Envoy has 0 rejected configurations")
noConnFailure := strings.Contains(output, "Envoy has detected 0 connection failure(s)")
listenersExist := strings.Contains(output, fmt.Sprintf("Listener for upstream \"%s\" found", libservice.StaticServerServiceName))
endpointUnhealthy := strings.Contains(output, "No healthy endpoints for cluster")
return certsValid && listenersExist && noRejectedConfig && noConnFailure && endpointUnhealthy
}, 60*time.Second, 10*time.Second)
})
}

View File

@ -18,7 +18,10 @@ func (t *Troubleshoot) validateCerts(certs *envoy_admin_v3.Certificates) validat
if certs == nil {
msg := validate.Message{
Success: false,
Message: "certificate object is nil in the proxy configuration",
Message: "Certificate object is nil in the proxy configuration",
PossibleActions: []string{
"Check the logs of the Consul agent configuring the local proxy and ensure XDS updates are being sent to the proxy",
},
}
return []validate.Message{msg}
}
@ -26,7 +29,10 @@ func (t *Troubleshoot) validateCerts(certs *envoy_admin_v3.Certificates) validat
if len(certs.GetCertificates()) == 0 {
msg := validate.Message{
Success: false,
Message: "no certificates found",
Message: "No certificates found",
PossibleActions: []string{
"Check the logs of the Consul agent configuring the local proxy and ensure XDS updates are being sent to the proxy",
},
}
return []validate.Message{msg}
}
@ -36,7 +42,10 @@ func (t *Troubleshoot) validateCerts(certs *envoy_admin_v3.Certificates) validat
if now.After(cacert.GetExpirationTime().AsTime()) {
msg := validate.Message{
Success: false,
Message: "ca certificate is expired",
Message: "CA certificate is expired",
PossibleActions: []string{
"Check the logs of the Consul agent configuring the local proxy and ensure XDS updates are being sent to the proxy",
},
}
certMessages = append(certMessages, msg)
}
@ -46,7 +55,10 @@ func (t *Troubleshoot) validateCerts(certs *envoy_admin_v3.Certificates) validat
if now.After(cc.GetExpirationTime().AsTime()) {
msg := validate.Message{
Success: false,
Message: "certificate chain is expired",
Message: "Certificate chain is expired",
PossibleActions: []string{
"Check the logs of the Consul agent configuring the local proxy and ensure XDS updates are being sent to the proxy",
},
}
certMessages = append(certMessages, msg)
}

View File

@ -21,13 +21,13 @@ func TestValidateCerts(t *testing.T) {
}{
"cert is nil": {
certs: nil,
expectedError: "certificate object is nil in the proxy configuration",
expectedError: "Certificate object is nil in the proxy configuration",
},
"no certificates": {
certs: &envoy_admin_v3.Certificates{
Certificates: []*envoy_admin_v3.Certificate{},
},
expectedError: "no certificates found",
expectedError: "No certificates found",
},
"ca expired": {
certs: &envoy_admin_v3.Certificates{
@ -41,7 +41,7 @@ func TestValidateCerts(t *testing.T) {
},
},
},
expectedError: "ca certificate is expired",
expectedError: "CA certificate is expired",
},
"cert expired": {
certs: &envoy_admin_v3.Certificates{
@ -55,7 +55,7 @@ func TestValidateCerts(t *testing.T) {
},
},
},
expectedError: "certificate chain is expired",
expectedError: "Certificate chain is expired",
},
}
@ -67,7 +67,9 @@ func TestValidateCerts(t *testing.T) {
var outputErrors string
for _, msgError := range messages.Errors() {
outputErrors += msgError.Message
outputErrors += msgError.PossibleActions
for _, action := range msgError.PossibleActions {
outputErrors += action
}
}
if tc.expectedError == "" {
require.True(t, messages.Success())

View File

@ -3,6 +3,7 @@ package troubleshoot
import (
"encoding/json"
"fmt"
envoy_admin_v3 "github.com/envoyproxy/go-control-plane/envoy/admin/v3"
"github.com/hashicorp/consul/troubleshoot/validate"
)
@ -32,10 +33,19 @@ func (t *Troubleshoot) troubleshootStats() (validate.Messages, error) {
}
}
if totalConfigRejections > 0 {
statMessages = append(statMessages, validate.Message{
Message: fmt.Sprintf("Envoy has %v rejected configurations", totalConfigRejections),
PossibleActions: []string{
"Check the logs of the Consul agent configuring the local proxy to see why Envoy rejected this configuration",
},
})
} else {
statMessages = append(statMessages, validate.Message{
Success: true,
Message: fmt.Sprintf("Envoy has %v rejected configurations", totalConfigRejections),
})
}
connectionFailureStats, err := t.getEnvoyStats("upstream_cx_connect_fail")
if err != nil {
@ -50,7 +60,7 @@ func (t *Troubleshoot) troubleshootStats() (validate.Messages, error) {
}
statMessages = append(statMessages, validate.Message{
Success: true,
Message: fmt.Sprintf("Envoy has detected %v connection failure(s).", totalCxFailures),
Message: fmt.Sprintf("Envoy has detected %v connection failure(s)", totalCxFailures),
})
return statMessages, nil
}

View File

@ -10,15 +10,6 @@ import (
"github.com/hashicorp/consul/troubleshoot/validate"
)
const (
listeners string = "type.googleapis.com/envoy.admin.v3.ListenersConfigDump"
clusters string = "type.googleapis.com/envoy.admin.v3.ClustersConfigDump"
routes string = "type.googleapis.com/envoy.admin.v3.RoutesConfigDump"
endpoints string = "type.googleapis.com/envoy.admin.v3.EndpointsConfigDump"
bootstrap string = "type.googleapis.com/envoy.admin.v3.BootstrapConfigDump"
httpConnManager string = "type.googleapis.com/envoy.extensions.filters.network.http_connection_manager.v3.HttpConnectionManager"
)
type Troubleshoot struct {
client *api.Client
envoyAddr net.IPAddr
@ -79,7 +70,7 @@ func (t *Troubleshoot) RunAllTests(upstreamEnvoyID, upstreamIP string) (validate
if errors := messages.Errors(); len(errors) == 0 {
msg := validate.Message{
Success: true,
Message: "certificates are valid",
Message: "Certificates are valid",
}
allTestMessages = append(allTestMessages, msg)
}
@ -97,7 +88,7 @@ func (t *Troubleshoot) RunAllTests(upstreamEnvoyID, upstreamIP string) (validate
if errors := messages.Errors(); len(errors) == 0 {
msg := validate.Message{
Success: true,
Message: "upstream resources are valid",
Message: "Upstream resources are valid",
}
allTestMessages = append(allTestMessages, msg)
}

View File

@ -2,6 +2,7 @@ package troubleshoot
import (
"fmt"
envoy_admin_v3 "github.com/envoyproxy/go-control-plane/envoy/admin/v3"
envoy_listener_v3 "github.com/envoyproxy/go-control-plane/envoy/config/listener/v3"
envoy_route_v3 "github.com/envoyproxy/go-control-plane/envoy/config/route/v3"
@ -29,7 +30,7 @@ func (t *Troubleshoot) GetUpstreams() ([]string, []UpstreamIP, error) {
for _, cfg := range t.envoyConfigDump.Configs {
switch cfg.TypeUrl {
case listeners:
case listenersType:
lcd := &envoy_admin_v3.ListenersConfigDump{}
err := proto.Unmarshal(cfg.GetValue(), lcd)
@ -135,7 +136,7 @@ func getClustersFromRoutes(routeName string, cfgDump *envoy_admin_v3.ConfigDump)
for _, cfg := range cfgDump.Configs {
switch cfg.TypeUrl {
case routes:
case routesType:
rcd := &envoy_admin_v3.RoutesConfigDump{}
err := proto.Unmarshal(cfg.GetValue(), rcd)

View File

@ -1,14 +1,15 @@
package troubleshoot
import (
"io"
"os"
"testing"
envoy_admin_v3 "github.com/envoyproxy/go-control-plane/envoy/admin/v3"
envoy_listener_v3 "github.com/envoyproxy/go-control-plane/envoy/config/listener/v3"
"github.com/stretchr/testify/require"
"google.golang.org/protobuf/encoding/protojson"
"google.golang.org/protobuf/proto"
"io"
"os"
"testing"
)
func TestGetUpstreamIPsFromFilterChain(t *testing.T) {
@ -61,7 +62,7 @@ func TestGetUpstreamIPsFromFilterChain(t *testing.T) {
for _, cfg := range cfgDump.Configs {
switch cfg.TypeUrl {
case listeners:
case listenersType:
lcd := &envoy_admin_v3.ListenersConfigDump{}
err := proto.Unmarshal(cfg.GetValue(), lcd)

View File

@ -110,7 +110,7 @@ type Messages []Message
type Message struct {
Success bool
Message string
PossibleActions string
PossibleActions []string
}
func (m Messages) Success() bool {
@ -137,6 +137,18 @@ func (m Messages) Errors() Messages {
// GetMessages returns the error based only on Validate's state.
func (v *Validate) GetMessages(validateEndpoints bool, endpointValidator EndpointValidator, clusters *envoy_admin_v3.Clusters) Messages {
var messages Messages
missingXDSActions := []string{
"Check that your upstream service is registered with Consul",
"Make sure your upstream exists by running the `consul[-k8s] troubleshoot upstreams` command",
"If you are using transparent proxy for this upstream, ensure you have set up allow intentions to the upstream",
"Check the logs of the Consul agent configuring the local proxy to ensure XDS resources were sent by Consul",
}
missingEndpointsActions := []string{
"Check that your upstream service is healthy and running",
"Check that your upstream service is registered with Consul",
"Check that the upstream proxy is healthy and running",
"If you are explicitly configuring upstreams, ensure the name of the upstream is correct",
}
var upstream string
upstream = v.envoyID
@ -145,19 +157,25 @@ func (v *Validate) GetMessages(validateEndpoints bool, endpointValidator Endpoin
}
if !v.listener {
messages = append(messages, Message{Message: fmt.Sprintf("no listener for upstream %q", upstream)})
messages = append(messages, Message{
Message: fmt.Sprintf("No listener for upstream %q", upstream),
PossibleActions: missingXDSActions,
})
} else {
messages = append(messages, Message{
Message: fmt.Sprintf("listener for upstream %q found", upstream),
Message: fmt.Sprintf("Listener for upstream %q found", upstream),
Success: true,
})
}
if v.usesRDS && !v.route {
messages = append(messages, Message{Message: fmt.Sprintf("no route for upstream %q", upstream)})
} else {
messages = append(messages, Message{
Message: fmt.Sprintf("route for upstream %q found", upstream),
Message: fmt.Sprintf("No route for upstream %q", upstream),
PossibleActions: missingXDSActions,
})
} else if v.route {
messages = append(messages, Message{
Message: fmt.Sprintf("Route for upstream %q found", upstream),
Success: true,
})
}
@ -173,11 +191,14 @@ func (v *Validate) GetMessages(validateEndpoints bool, endpointValidator Endpoin
_, ok := v.snis[sni]
if !ok || !resource.cluster {
messages = append(messages, Message{Message: fmt.Sprintf("no cluster %q for upstream %q", sni, upstream)})
messages = append(messages, Message{
Message: fmt.Sprintf("No cluster %q for upstream %q", sni, upstream),
PossibleActions: missingXDSActions,
})
continue
} else {
messages = append(messages, Message{
Message: fmt.Sprintf("cluster %q for upstream %q found", sni, upstream),
Message: fmt.Sprintf("Cluster %q for upstream %q found", sni, upstream),
Success: true,
})
}
@ -186,7 +207,7 @@ func (v *Validate) GetMessages(validateEndpoints bool, endpointValidator Endpoin
// validation.
if strings.Contains(sni, "passthrough~") {
messages = append(messages, Message{
Message: fmt.Sprintf("cluster %q is a passthrough cluster, skipping endpoint healthiness check", sni),
Message: fmt.Sprintf("Cluster %q is a passthrough cluster, skipping endpoint healthiness check", sni),
Success: true,
})
continue
@ -206,10 +227,13 @@ func (v *Validate) GetMessages(validateEndpoints bool, endpointValidator Endpoin
}
}
if !oneClusterHasEndpoints {
messages = append(messages, Message{Message: fmt.Sprintf("no healthy endpoints for aggregate cluster %q for upstream %q", sni, upstream)})
messages = append(messages, Message{
Message: fmt.Sprintf("No healthy endpoints for aggregate cluster %q for upstream %q", sni, upstream),
PossibleActions: missingEndpointsActions,
})
} else {
messages = append(messages, Message{
Message: fmt.Sprintf("healthy endpoints for aggregate cluster %q for upstream %q", sni, upstream),
Message: fmt.Sprintf("Healthy endpoints for aggregate cluster %q for upstream %q found", sni, upstream),
Success: true,
})
}
@ -218,11 +242,12 @@ func (v *Validate) GetMessages(validateEndpoints bool, endpointValidator Endpoin
endpointValidator(resource, sni, clusters)
if (resource.usesEDS && !resource.loadAssignment) || resource.endpoints == 0 {
messages = append(messages, Message{
Message: fmt.Sprintf("no healthy endpoints for cluster %q for upstream %q", sni, upstream),
Message: fmt.Sprintf("No healthy endpoints for cluster %q for upstream %q", sni, upstream),
PossibleActions: missingEndpointsActions,
})
} else {
messages = append(messages, Message{
Message: fmt.Sprintf("healthy endpoints for cluster %q for upstream %q", sni, upstream),
Message: fmt.Sprintf("Healthy endpoints for cluster %q for upstream %q found", sni, upstream),
Success: true,
})
}
@ -235,7 +260,7 @@ func (v *Validate) GetMessages(validateEndpoints bool, endpointValidator Endpoin
}
if numRequiredResources == 0 {
messages = append(messages, Message{Message: fmt.Sprintf("no clusters found on route or listener")})
messages = append(messages, Message{Message: fmt.Sprintf("No clusters found on route or listener")})
}
return messages

View File

@ -63,7 +63,7 @@ func TestErrors(t *testing.T) {
r.loadAssignment = true
r.endpoints = 1
},
err: "no clusters found on route or listener",
err: "No clusters found on route or listener",
},
"no healthy endpoints": {
validate: func() *Validate {
@ -86,7 +86,7 @@ func TestErrors(t *testing.T) {
endpointValidator: func(r *resource, s string, clusters *envoy_admin_v3.Clusters) {
r.loadAssignment = true
},
err: "no healthy endpoints for cluster \"db-sni\" for upstream \"db\"",
err: "No healthy endpoints for cluster \"db-sni\" for upstream \"db\"",
},
"success: aggregate cluster with one target with endpoints": {
validate: func() *Validate {
@ -169,7 +169,7 @@ func TestErrors(t *testing.T) {
r.loadAssignment = true
r.endpoints = 0
},
err: "no healthy endpoints for aggregate cluster \"db-sni\" for upstream \"db\"",
err: "No healthy endpoints for aggregate cluster \"db-sni\" for upstream \"db\"",
},
"success: passthrough cluster doesn't error even though there are zero endpoints": {
validate: func() *Validate {
@ -203,7 +203,9 @@ func TestErrors(t *testing.T) {
var outputErrors string
for _, msgError := range messages.Errors() {
outputErrors += msgError.Message
outputErrors += msgError.PossibleActions
for _, action := range msgError.PossibleActions {
outputErrors += action
}
}
if tc.err == "" {
require.True(t, messages.Success())