Special case the error returned when we have a Raft leader but are not tracking it in the ServerLookup (#9487)
This can happen when one other node in the cluster such as a client is unable to communicate with the leader server and sees it as failed. When that happens its failing status eventually gets propagated to the other servers in the cluster and eventually this can result in RPCs returning “No cluster leader” error. That error is misleading and unhelpful for determing the root cause of the issue as its not raft stability but rather and client -> server networking issue. Therefore this commit will add a new error that will be returned in that case to differentiate between the two cases.
This commit is contained in:
parent
1e58b31098
commit
3a79b559f9
|
@ -593,7 +593,7 @@ CHECK_LEADER:
|
||||||
}
|
}
|
||||||
|
|
||||||
// Find the leader
|
// Find the leader
|
||||||
isLeader, leader := s.getLeader()
|
isLeader, leader, rpcErr := s.getLeader()
|
||||||
|
|
||||||
// Handle the case we are the leader
|
// Handle the case we are the leader
|
||||||
if isLeader {
|
if isLeader {
|
||||||
|
@ -601,7 +601,6 @@ CHECK_LEADER:
|
||||||
}
|
}
|
||||||
|
|
||||||
// Handle the case of a known leader
|
// Handle the case of a known leader
|
||||||
rpcErr := structs.ErrNoLeader
|
|
||||||
if leader != nil {
|
if leader != nil {
|
||||||
rpcErr = s.connPool.RPC(s.config.Datacenter, leader.ShortName, leader.Addr,
|
rpcErr = s.connPool.RPC(s.config.Datacenter, leader.ShortName, leader.Addr,
|
||||||
method, args, reply)
|
method, args, reply)
|
||||||
|
@ -632,24 +631,38 @@ RETRY:
|
||||||
|
|
||||||
// getLeader returns if the current node is the leader, and if not then it
|
// getLeader returns if the current node is the leader, and if not then it
|
||||||
// returns the leader which is potentially nil if the cluster has not yet
|
// returns the leader which is potentially nil if the cluster has not yet
|
||||||
// elected a leader.
|
// elected a leader. In the case of not having a leader elected yet
|
||||||
func (s *Server) getLeader() (bool, *metadata.Server) {
|
// then a NoClusterLeader error gets returned. In the case of Raft having
|
||||||
|
// a leader but out internal tracking failing to find the leader we
|
||||||
|
// return a LeaderNotTracked error. Therefore if the err is nil AND
|
||||||
|
// the bool is false then the Server will be non-nil
|
||||||
|
func (s *Server) getLeader() (bool, *metadata.Server, error) {
|
||||||
// Check if we are the leader
|
// Check if we are the leader
|
||||||
if s.IsLeader() {
|
if s.IsLeader() {
|
||||||
return true, nil
|
return true, nil, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// Get the leader
|
// Get the leader
|
||||||
leader := s.raft.Leader()
|
leader := s.raft.Leader()
|
||||||
if leader == "" {
|
if leader == "" {
|
||||||
return false, nil
|
return false, nil, structs.ErrNoLeader
|
||||||
}
|
}
|
||||||
|
|
||||||
// Lookup the server
|
// Lookup the server
|
||||||
server := s.serverLookup.Server(leader)
|
server := s.serverLookup.Server(leader)
|
||||||
|
|
||||||
// Server could be nil
|
// if server is nil this indicates that while we have a Raft leader
|
||||||
return false, server
|
// something has caused that node to be considered unhealthy which
|
||||||
|
// cascades into its removal from the serverLookup struct. In this case
|
||||||
|
// we should not report no cluster leader but instead report a different
|
||||||
|
// error so as not to confuse our users as to the what the root cause of
|
||||||
|
// an issue might be.
|
||||||
|
if server == nil {
|
||||||
|
s.logger.Warn("Raft has a leader but other tracking of the node would indicate that the node is unhealthy or does not exist. The network may be misconfigured.", "leader", leader)
|
||||||
|
return false, nil, structs.ErrLeaderNotTracked
|
||||||
|
}
|
||||||
|
|
||||||
|
return false, server, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// forwardDC is used to forward an RPC call to a remote DC, or fail if no servers
|
// forwardDC is used to forward an RPC call to a remote DC, or fail if no servers
|
||||||
|
|
|
@ -3,6 +3,7 @@ package consul
|
||||||
import (
|
import (
|
||||||
"bytes"
|
"bytes"
|
||||||
"encoding/binary"
|
"encoding/binary"
|
||||||
|
"errors"
|
||||||
"math"
|
"math"
|
||||||
"net"
|
"net"
|
||||||
"os"
|
"os"
|
||||||
|
@ -130,7 +131,7 @@ func TestRPC_NoLeader_Retry(t *testing.T) {
|
||||||
|
|
||||||
// This isn't sure-fire but tries to check that we don't have a
|
// This isn't sure-fire but tries to check that we don't have a
|
||||||
// leader going into the RPC, so we exercise the retry logic.
|
// leader going into the RPC, so we exercise the retry logic.
|
||||||
if ok, _ := s1.getLeader(); ok {
|
if ok, _, _ := s1.getLeader(); ok {
|
||||||
t.Fatalf("should not have a leader yet")
|
t.Fatalf("should not have a leader yet")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -142,6 +143,54 @@ func TestRPC_NoLeader_Retry(t *testing.T) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestRPC_getLeader_ErrLeaderNotTracked(t *testing.T) {
|
||||||
|
if testing.Short() {
|
||||||
|
t.Skip("too slow for testing.Short")
|
||||||
|
}
|
||||||
|
|
||||||
|
cluster := newTestCluster(t, &testClusterConfig{
|
||||||
|
Datacenter: "dc1",
|
||||||
|
Servers: 3,
|
||||||
|
ServerWait: func(t *testing.T, srv *Server) {
|
||||||
|
// The test cluster waits for a leader to be established
|
||||||
|
// but not for all the RPC tracking of all servers to be updated
|
||||||
|
// so we also want to wait for that here
|
||||||
|
retry.Run(t, func(r *retry.R) {
|
||||||
|
if !srv.IsLeader() {
|
||||||
|
_, _, err := srv.getLeader()
|
||||||
|
require.NoError(r, err)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
},
|
||||||
|
})
|
||||||
|
|
||||||
|
// At this point we know we have a cluster with a leader and all followers are tracking that
|
||||||
|
// leader in the serverLookup struct. We need to find a follower to hack its server lookup
|
||||||
|
// to force the error we desire
|
||||||
|
|
||||||
|
var follower *Server
|
||||||
|
for _, srv := range cluster.Servers {
|
||||||
|
if !srv.IsLeader() {
|
||||||
|
follower = srv
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
_, leaderMeta, err := follower.getLeader()
|
||||||
|
require.NoError(t, err)
|
||||||
|
|
||||||
|
// now do some behind the scenes trickery on the followers server lookup
|
||||||
|
// to remove the leader from it so that we can force a ErrLeaderNotTracked error
|
||||||
|
follower.serverLookup.RemoveServer(leaderMeta)
|
||||||
|
|
||||||
|
isLeader, meta, err := follower.getLeader()
|
||||||
|
require.Error(t, err)
|
||||||
|
require.True(t, errors.Is(err, structs.ErrLeaderNotTracked))
|
||||||
|
require.Nil(t, meta)
|
||||||
|
require.False(t, isLeader)
|
||||||
|
}
|
||||||
|
|
||||||
type MockSink struct {
|
type MockSink struct {
|
||||||
*bytes.Buffer
|
*bytes.Buffer
|
||||||
cancel bool
|
cancel bool
|
||||||
|
|
|
@ -48,9 +48,9 @@ func (s *Server) dispatchSnapshotRequest(args *structs.SnapshotRequest, in io.Re
|
||||||
|
|
||||||
// Perform leader forwarding if required.
|
// Perform leader forwarding if required.
|
||||||
if !args.AllowStale {
|
if !args.AllowStale {
|
||||||
if isLeader, server := s.getLeader(); !isLeader {
|
if isLeader, server, err := s.getLeader(); !isLeader {
|
||||||
if server == nil {
|
if err != nil {
|
||||||
return nil, structs.ErrNoLeader
|
return nil, err
|
||||||
}
|
}
|
||||||
return SnapshotRPC(s.connPool, args.Datacenter, server.ShortName, server.Addr, args, in, reply)
|
return SnapshotRPC(s.connPool, args.Datacenter, server.ShortName, server.Addr, args, in, reply)
|
||||||
}
|
}
|
||||||
|
|
|
@ -15,6 +15,7 @@ const (
|
||||||
errRPCRateExceeded = "RPC rate limit exceeded"
|
errRPCRateExceeded = "RPC rate limit exceeded"
|
||||||
errServiceNotFound = "Service not found: "
|
errServiceNotFound = "Service not found: "
|
||||||
errQueryNotFound = "Query not found"
|
errQueryNotFound = "Query not found"
|
||||||
|
errLeaderNotTracked = "Raft leader not found in server lookup mapping"
|
||||||
)
|
)
|
||||||
|
|
||||||
var (
|
var (
|
||||||
|
@ -26,6 +27,7 @@ var (
|
||||||
ErrRPCRateExceeded = errors.New(errRPCRateExceeded)
|
ErrRPCRateExceeded = errors.New(errRPCRateExceeded)
|
||||||
ErrDCNotAvailable = errors.New(errDCNotAvailable)
|
ErrDCNotAvailable = errors.New(errDCNotAvailable)
|
||||||
ErrQueryNotFound = errors.New(errQueryNotFound)
|
ErrQueryNotFound = errors.New(errQueryNotFound)
|
||||||
|
ErrLeaderNotTracked = errors.New(errLeaderNotTracked)
|
||||||
)
|
)
|
||||||
|
|
||||||
func IsErrNoDCPath(err error) bool {
|
func IsErrNoDCPath(err error) bool {
|
||||||
|
|
Loading…
Reference in New Issue