areas: make the gRPC server tracker network area aware (#11748)

Fixes a bug whereby servers present in multiple network areas would be
properly segmented in the Router, but not in the gRPC mirror. This would
lead servers in the current datacenter leaving from a network area
(possibly during the network area's removal) from deleting their own
records that still exist in the standard WAN area.

The gRPC client stack uses the gRPC server tracker to execute all RPCs,
even those targeting members of the current datacenter (which is unlike
the net/rpc stack which has a bypass mechanism).

This would manifest as a gRPC method call never opening a socket because
it would block forever waiting for the current datacenter's pool of
servers to be non-empty.
This commit is contained in:
R.B. Boyer 2021-12-06 09:55:54 -06:00 committed by GitHub
parent d86b98c503
commit 80422c0dfe
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 76 additions and 41 deletions

3
.changelog/11748.txt Normal file
View File

@ -0,0 +1,3 @@
```release-note:bug
areas: **(Enterprise only)** make the gRPC server tracker network area aware
```

View File

@ -20,6 +20,7 @@ import (
"github.com/hashicorp/consul/ipaddr"
"github.com/hashicorp/consul/sdk/freeport"
"github.com/hashicorp/consul/tlsutil"
"github.com/hashicorp/consul/types"
)
// useTLSForDcAlwaysTrue tell GRPC to always return the TLS is enabled
@ -33,7 +34,7 @@ func TestNewDialer_WithTLSWrapper(t *testing.T) {
t.Cleanup(logError(t, lis.Close))
builder := resolver.NewServerResolverBuilder(newConfig(t))
builder.AddServer(&metadata.Server{
builder.AddServer(types.AreaWAN, &metadata.Server{
Name: "server-1",
ID: "ID1",
Datacenter: "dc1",
@ -84,14 +85,14 @@ func TestNewDialer_WithALPNWrapper(t *testing.T) {
}()
builder := resolver.NewServerResolverBuilder(newConfig(t))
builder.AddServer(&metadata.Server{
builder.AddServer(types.AreaWAN, &metadata.Server{
Name: "server-1",
ID: "ID1",
Datacenter: "dc1",
Addr: lis1.Addr(),
UseTLS: true,
})
builder.AddServer(&metadata.Server{
builder.AddServer(types.AreaWAN, &metadata.Server{
Name: "server-2",
ID: "ID2",
Datacenter: "dc2",
@ -153,7 +154,7 @@ func TestNewDialer_IntegrationWithTLSEnabledHandler(t *testing.T) {
srv := newTestServer(t, "server-1", "dc1", tlsConf)
md := srv.Metadata()
res.AddServer(md)
res.AddServer(types.AreaWAN, md)
t.Cleanup(srv.shutdown)
pool := NewClientConnPool(ClientConnPoolConfig{
@ -211,7 +212,7 @@ func TestNewDialer_IntegrationWithTLSEnabledHandler_viaMeshGateway(t *testing.T)
}()
md := srv.Metadata()
res.AddServer(md)
res.AddServer(types.AreaWAN, md)
t.Cleanup(srv.shutdown)
clientTLSConf, err := tlsutil.NewConfigurator(tlsutil.Config{
@ -266,7 +267,7 @@ func TestClientConnPool_IntegrationWithGRPCResolver_Failover(t *testing.T) {
for i := 0; i < count; i++ {
name := fmt.Sprintf("server-%d", i)
srv := newTestServer(t, name, "dc1", nil)
res.AddServer(srv.Metadata())
res.AddServer(types.AreaWAN, srv.Metadata())
t.Cleanup(srv.shutdown)
}
@ -280,7 +281,7 @@ func TestClientConnPool_IntegrationWithGRPCResolver_Failover(t *testing.T) {
first, err := client.Something(ctx, &testservice.Req{})
require.NoError(t, err)
res.RemoveServer(&metadata.Server{ID: first.ServerName, Datacenter: "dc1"})
res.RemoveServer(types.AreaWAN, &metadata.Server{ID: first.ServerName, Datacenter: "dc1"})
resp, err := client.Something(ctx, &testservice.Req{})
require.NoError(t, err)
@ -302,7 +303,7 @@ func TestClientConnPool_ForwardToLeader_Failover(t *testing.T) {
for i := 0; i < count; i++ {
name := fmt.Sprintf("server-%d", i)
srv := newTestServer(t, name, "dc1", nil)
res.AddServer(srv.Metadata())
res.AddServer(types.AreaWAN, srv.Metadata())
servers = append(servers, srv)
t.Cleanup(srv.shutdown)
}
@ -352,7 +353,7 @@ func TestClientConnPool_IntegrationWithGRPCResolver_Rebalance(t *testing.T) {
for i := 0; i < count; i++ {
name := fmt.Sprintf("server-%d", i)
srv := newTestServer(t, name, "dc1", nil)
res.AddServer(srv.Metadata())
res.AddServer(types.AreaWAN, srv.Metadata())
t.Cleanup(srv.shutdown)
}
@ -406,7 +407,7 @@ func TestClientConnPool_IntegrationWithGRPCResolver_MultiDC(t *testing.T) {
for _, dc := range dcs {
name := "server-0-" + dc
srv := newTestServer(t, name, dc, nil)
res.AddServer(srv.Metadata())
res.AddServer(types.AreaWAN, srv.Metadata())
t.Cleanup(srv.shutdown)
}

View File

@ -10,6 +10,7 @@ import (
"google.golang.org/grpc/resolver"
"github.com/hashicorp/consul/agent/metadata"
"github.com/hashicorp/consul/types"
)
// ServerResolverBuilder tracks the current server list and keeps any
@ -18,9 +19,9 @@ type ServerResolverBuilder struct {
cfg Config
// leaderResolver is used to track the address of the leader in the local DC.
leaderResolver leaderResolver
// servers is an index of Servers by Server.ID. The map contains server IDs
// servers is an index of Servers by area and Server.ID. The map contains server IDs
// for all datacenters.
servers map[string]*metadata.Server
servers map[types.AreaID]map[string]*metadata.Server
// resolvers is an index of connections to the serverResolver which manages
// addresses of servers for that connection.
resolvers map[resolver.ClientConn]*serverResolver
@ -37,7 +38,7 @@ type Config struct {
func NewServerResolverBuilder(cfg Config) *ServerResolverBuilder {
return &ServerResolverBuilder{
cfg: cfg,
servers: make(map[string]*metadata.Server),
servers: make(map[types.AreaID]map[string]*metadata.Server),
resolvers: make(map[resolver.ClientConn]*serverResolver),
}
}
@ -72,11 +73,13 @@ func (s *ServerResolverBuilder) ServerForGlobalAddr(globalAddr string) (*metadat
s.lock.RLock()
defer s.lock.RUnlock()
for _, server := range s.servers {
for _, areaServers := range s.servers {
for _, server := range areaServers {
if DCPrefix(server.Datacenter, server.Addr.String()) == globalAddr {
return server, nil
}
}
}
return nil, fmt.Errorf("failed to find Consul server for global address %q", globalAddr)
}
@ -138,11 +141,17 @@ func (s *ServerResolverBuilder) Authority() string {
}
// AddServer updates the resolvers' states to include the new server's address.
func (s *ServerResolverBuilder) AddServer(server *metadata.Server) {
func (s *ServerResolverBuilder) AddServer(areaID types.AreaID, server *metadata.Server) {
s.lock.Lock()
defer s.lock.Unlock()
s.servers[uniqueID(server)] = server
areaServers, ok := s.servers[areaID]
if !ok {
areaServers = make(map[string]*metadata.Server)
s.servers[areaID] = areaServers
}
areaServers[uniqueID(server)] = server
addrs := s.getDCAddrs(server.Datacenter)
for _, resolver := range s.resolvers {
@ -168,11 +177,19 @@ func DCPrefix(datacenter, suffix string) string {
}
// RemoveServer updates the resolvers' states with the given server removed.
func (s *ServerResolverBuilder) RemoveServer(server *metadata.Server) {
func (s *ServerResolverBuilder) RemoveServer(areaID types.AreaID, server *metadata.Server) {
s.lock.Lock()
defer s.lock.Unlock()
delete(s.servers, uniqueID(server))
areaServers, ok := s.servers[areaID]
if !ok {
return // already gone
}
delete(areaServers, uniqueID(server))
if len(areaServers) == 0 {
delete(s.servers, areaID)
}
addrs := s.getDCAddrs(server.Datacenter)
for _, resolver := range s.resolvers {
@ -185,12 +202,22 @@ func (s *ServerResolverBuilder) RemoveServer(server *metadata.Server) {
// getDCAddrs returns a list of the server addresses for the given datacenter.
// This method requires that lock is held for reads.
func (s *ServerResolverBuilder) getDCAddrs(dc string) []resolver.Address {
var addrs []resolver.Address
for _, server := range s.servers {
var (
addrs []resolver.Address
keptServerIDs = make(map[string]struct{})
)
for _, areaServers := range s.servers {
for _, server := range areaServers {
if server.Datacenter != dc {
continue
}
// Servers may be part of multiple areas, so only include each one once.
if _, ok := keptServerIDs[server.ID]; ok {
continue
}
keptServerIDs[server.ID] = struct{}{}
addrs = append(addrs, resolver.Address{
// NOTE: the address persisted here is only dialable using our custom dialer
Addr: DCPrefix(server.Datacenter, server.Addr.String()),
@ -198,6 +225,7 @@ func (s *ServerResolverBuilder) getDCAddrs(dc string) []resolver.Address {
ServerName: server.Name,
})
}
}
return addrs
}

View File

@ -1,13 +1,16 @@
package router
import "github.com/hashicorp/consul/agent/metadata"
import (
"github.com/hashicorp/consul/agent/metadata"
"github.com/hashicorp/consul/types"
)
// ServerTracker is called when Router is notified of a server being added or
// removed.
type ServerTracker interface {
NewRebalancer(dc string) func()
AddServer(*metadata.Server)
RemoveServer(*metadata.Server)
AddServer(types.AreaID, *metadata.Server)
RemoveServer(types.AreaID, *metadata.Server)
}
// Rebalancer is called periodically to re-order the servers so that the load on the
@ -24,7 +27,7 @@ func (NoOpServerTracker) NewRebalancer(string) func() {
}
// AddServer does nothing
func (NoOpServerTracker) AddServer(*metadata.Server) {}
func (NoOpServerTracker) AddServer(types.AreaID, *metadata.Server) {}
// RemoveServer does nothing
func (NoOpServerTracker) RemoveServer(*metadata.Server) {}
func (NoOpServerTracker) RemoveServer(types.AreaID, *metadata.Server) {}

View File

@ -175,7 +175,7 @@ func (r *Router) AddArea(areaID types.AreaID, cluster RouterSerfCluster, pinger
continue
}
if err := r.addServer(area, parts); err != nil {
if err := r.addServer(areaID, area, parts); err != nil {
return fmt.Errorf("failed to add server %q to area %q: %v", m.Name, areaID, err)
}
}
@ -276,7 +276,7 @@ func (r *Router) maybeInitializeManager(area *areaInfo, dc string) *Manager {
}
// addServer does the work of AddServer once the write lock is held.
func (r *Router) addServer(area *areaInfo, s *metadata.Server) error {
func (r *Router) addServer(areaID types.AreaID, area *areaInfo, s *metadata.Server) error {
// Make the manager on the fly if this is the first we've seen of it,
// and add it to the index.
manager := r.maybeInitializeManager(area, s.Datacenter)
@ -288,7 +288,7 @@ func (r *Router) addServer(area *areaInfo, s *metadata.Server) error {
}
manager.AddServer(s)
r.grpcServerTracker.AddServer(s)
r.grpcServerTracker.AddServer(areaID, s)
return nil
}
@ -302,7 +302,7 @@ func (r *Router) AddServer(areaID types.AreaID, s *metadata.Server) error {
if !ok {
return fmt.Errorf("area ID %q does not exist", areaID)
}
return r.addServer(area, s)
return r.addServer(areaID, area, s)
}
// RemoveServer should be called whenever a server is removed from an area. This
@ -324,7 +324,7 @@ func (r *Router) RemoveServer(areaID types.AreaID, s *metadata.Server) error {
return nil
}
info.manager.RemoveServer(s)
r.grpcServerTracker.RemoveServer(s)
r.grpcServerTracker.RemoveServer(areaID, s)
// If this manager is empty then remove it so we don't accumulate cruft
// and waste time during request routing.